knitr::opts_chunk$set(echo = TRUE)
require("ggplot2")
## Loading required package: ggplot2
require ("plyr")
## Loading required package: plyr
require ("caTools")
## Loading required package: caTools

Employment Data Analysis

The secrets to getting ahead in this company:
      1. Work for many years with several different companies before coming to this company. Your Total Time Worked will provide you with a higher job level and higher monthly income.
      2. Change Managers frequently. The longer you stay with a manager, the longer the time before you will receive your next promotion.
      3. Aspire and work toward being a manager or director. These job roles make more money. If you stay with the company long enough and have many years of total work time before coming to the company, you should be able to obtain one of these positions.
      4. Do not work as a Sales Representative. This job role makes less money.
      5. Don’t worry about job performance ratings. Everyone receives either an outstanding or an excellent rating. The ratings have no impact on your monthly income.

DataSet

## Number of employees in the dataset:
## 1470
## Number of variables in the dataset:
## 35
## Number of employees who have left the company and the number if employees who remain:
##  Left Freq
##    No 1233
##   Yes  237
## 3 Variables had no variation,that is, are constants and will not aid in the analysis
## These variables are removed from the dataset.
## Number of variables in the dataset, after removing constants:
## 32
## Count of field types in the dataset:
## 
##  factor integer 
##       8      24

Job Role Pareto

# Horizontal stacked bar chart of employee counts per job role, filled by
# department. The Department x JobRole contingency table is flattened to a
# data frame (columns Department, JobRole, Freq) and roles are ordered with
# reorder() on Freq.
ggplot(
  data = as.data.frame(
    table(
      attrit$Department,
      attrit$JobRole,
      dnn = list("Department", "JobRole")
    )
  ),
  mapping = aes(x = reorder(JobRole, Freq), y = Freq, fill = Department)
) +
  geom_col() +
  coord_flip() +
  ggtitle("Number of Employees per Job Role by Department") +
  xlab("") +
  ylab("") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(
    values = alpha(c("steelblue4", "thistle4", "paleturquoise4"))
  )

The Job Role “Manager” is the only Job Role that is in multiple departments
To aid analysis the values of JobRole for the Managers will be recoded
# Recode Managers: prefix the generic "Manager" role with the employee's
# department (e.g. "<Department> Manager"); every other role keeps its
# label. The result is stored back as a factor.
attrit$JobRole <- as.factor(
  ifelse(
    attrit$JobRole == "Manager",
    paste(attrit$Department, attrit$JobRole),
    as.character(attrit$JobRole)
  )
)
# Same horizontal stacked bar chart of employee counts per job role, drawn
# again so the department-specific manager roles appear as separate bars.
ggplot(
  data = as.data.frame(
    table(
      attrit$Department,
      attrit$JobRole,
      dnn = list("Department", "JobRole")
    )
  ),
  mapping = aes(x = reorder(JobRole, Freq), y = Freq, fill = Department)
) +
  geom_col() +
  coord_flip() +
  ggtitle("Number of Employees per Job Role by Department \nafter recoding") +
  xlab("") +
  ylab("") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(
    values = alpha(c("steelblue4", "thistle4", "paleturquoise4"))
  )

# Recode the variable "JobRole" to an integer code for modeling.
# The role -> code mapping lives in one named lookup vector. Codes 1-10 are
# unchanged. "Sales Manager" (code 11) is added because prefixing "Manager"
# with the department can also produce that level; without an entry those
# rows would be left with JRCode = NA.
# Any role not listed here (or an NA role) gets NA.
job_role_codes <- c(
  "Healthcare Representative"      = 1L,
  "Human Resources"                = 2L,
  "Laboratory Technician"          = 3L,
  "Manufacturing Director"         = 4L,
  "Research Director"              = 5L,
  "Sales Executive"                = 6L,
  "Research Scientist"             = 7L,
  "Sales Representative"           = 8L,
  "Human Resources Manager"        = 9L,
  "Research & Development Manager" = 10L,
  "Sales Manager"                  = 11L
)
# Index by label (not by factor level number) and drop the names so JRCode
# is a plain integer column.
attrit$JRCode <- unname(job_role_codes[as.character(attrit$JobRole)])
# NOTE(review): bare numeric literal; evaluating it only prints the value and
# changes no state. Looks like a leftover -- confirm whether it can be removed.
0.0008
## [1] 8e-04
# Create a data frame of current employees only (Attrition == "No").
noattrit <- attrit[attrit$Attrition == "No", ]
# Kept from the original: removes a column literally named `Attrit` if one
# exists. `$<-` on a data frame does not partial-match names, so this never
# removes `Attrition`.
noattrit$Attrit <- NULL
# Attrition is constant ("No") in this subset, so drop it explicitly.
# NOTE(review): confirm nothing later reads noattrit$Attrition.
noattrit$Attrition <- NULL

Attrition

Attrition Analysis

First take a quick look at the data. We will name the main dataset used for the attrition portion “attrition”

# Load the attrition data; the first row of the CSV holds the column names.
# stringsAsFactors = TRUE is spelled out because its default became FALSE in
# R 4.0.0; the factor columns shown by str()/summary() and the factor response
# used in the logistic regression depend on text columns being read as
# factors.
attrition <- read.csv(
  "CaseStudy2reorder.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# View(attrition)
head(attrition)
##   X Attrition    BusinessTravel             Department EducationField
## 1 1       Yes     Travel_Rarely                  Sales  Life Sciences
## 2 2        No Travel_Frequently Research & Development  Life Sciences
## 3 3       Yes     Travel_Rarely Research & Development          Other
## 4 4        No Travel_Frequently Research & Development  Life Sciences
## 5 5        No     Travel_Rarely Research & Development        Medical
## 6 6        No Travel_Frequently Research & Development  Life Sciences
##   Gender               JobRole OverTime MaritalStatus Age DailyRate
## 1 Female       Sales Executive      Yes        Single  41      1102
## 2   Male    Research Scientist       No       Married  49       279
## 3   Male Laboratory Technician      Yes        Single  37      1373
## 4 Female    Research Scientist      Yes       Married  33      1392
## 5   Male Laboratory Technician       No       Married  27       591
## 6   Male Laboratory Technician       No        Single  32      1005
##   DistanceFromHome Education EmployeeNumber EnvironmentSatisfaction
## 1                1         2              1                       2
## 2                8         1              2                       3
## 3                2         2              4                       4
## 4                3         4              5                       4
## 5                2         1              7                       1
## 6                2         2              8                       4
##   HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome
## 1         94              3        2               4          5993
## 2         61              2        2               2          5130
## 3         92              2        1               3          2090
## 4         56              3        1               3          2909
## 5         40              3        1               2          3468
## 6         79              3        1               4          3068
##   MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating
## 1       19479                  8                11                 3
## 2       24907                  1                23                 4
## 3        2396                  6                15                 3
## 4       23159                  1                11                 3
## 5       16632                  9                12                 3
## 6       11864                  0                13                 3
##   RelationshipSatisfaction StockOptionLevel TotalWorkingYears
## 1                        1                0                 8
## 2                        4                1                10
## 3                        2                0                 7
## 4                        3                0                 8
## 5                        4                1                 6
## 6                        3                0                 8
##   TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 1                     0               1              6                  4
## 2                     3               3             10                  7
## 3                     3               3              0                  0
## 4                     3               3              8                  7
## 5                     3               3              2                  2
## 6                     2               2              7                  7
##   YearsSinceLastPromotion YearsWithCurrManager
## 1                       0                    5
## 2                       1                    7
## 3                       0                    0
## 4                       3                    0
## 5                       2                    2
## 6                       3                    6

Drop the first Column, attach dataset for ease of variable recall

# The leading column (`X` in the head() output) is obsolete, so discard it,
# then inspect the structure of the remaining columns.
attrition <- attrition[, -1L]
str(attrition)
## 'data.frame':    1470 obs. of  32 variables:
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
##  $ Age                     : int  41 49 37 33 27 32 59 30 38 36 ...
##  $ DailyRate               : int  1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
##  $ DistanceFromHome        : int  1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : int  2 1 2 4 1 2 3 1 3 3 ...
##  $ EmployeeNumber          : int  1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : int  2 3 4 4 1 4 3 4 4 3 ...
##  $ HourlyRate              : int  94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : int  3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : int  2 2 1 1 1 1 1 1 3 2 ...
##  $ JobSatisfaction         : int  4 2 3 3 2 4 1 3 3 3 ...
##  $ MonthlyIncome           : int  5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
##  $ MonthlyRate             : int  19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
##  $ NumCompaniesWorked      : int  8 1 6 1 9 0 4 1 0 6 ...
##  $ PercentSalaryHike       : int  11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : int  3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: int  1 4 2 3 4 3 1 2 2 2 ...
##  $ StockOptionLevel        : int  0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : int  8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : int  0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : int  1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : int  4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : int  0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : int  5 7 0 0 2 6 0 0 8 7 ...
# Per-column summary of the loaded data: level counts for factor columns,
# five-number summary plus mean for the integer columns.
summary(attrition)
##  Attrition            BusinessTravel                  Department 
##  No :1233   Non-Travel       : 150   Human Resources       : 63  
##  Yes: 237   Travel_Frequently: 277   Research & Development:961  
##             Travel_Rarely    :1043   Sales                 :446  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##           EducationField    Gender                         JobRole   
##  Human Resources : 27    Female:588   Sales Executive          :326  
##  Life Sciences   :606    Male  :882   Research Scientist       :292  
##  Marketing       :159                 Laboratory Technician    :259  
##  Medical         :464                 Manufacturing Director   :145  
##  Other           : 82                 Healthcare Representative:131  
##  Technical Degree:132                 Manager                  :102  
##                                       (Other)                  :215  
##  OverTime    MaritalStatus      Age          DailyRate     
##  No :1054   Divorced:327   Min.   :18.00   Min.   : 102.0  
##  Yes: 416   Married :673   1st Qu.:30.00   1st Qu.: 465.0  
##             Single  :470   Median :36.00   Median : 802.0  
##                            Mean   :36.92   Mean   : 802.5  
##                            3rd Qu.:43.00   3rd Qu.:1157.0  
##                            Max.   :60.00   Max.   :1499.0  
##                                                            
##  DistanceFromHome   Education     EmployeeNumber   EnvironmentSatisfaction
##  Min.   : 1.000   Min.   :1.000   Min.   :   1.0   Min.   :1.000          
##  1st Qu.: 2.000   1st Qu.:2.000   1st Qu.: 491.2   1st Qu.:2.000          
##  Median : 7.000   Median :3.000   Median :1020.5   Median :3.000          
##  Mean   : 9.193   Mean   :2.913   Mean   :1024.9   Mean   :2.722          
##  3rd Qu.:14.000   3rd Qu.:4.000   3rd Qu.:1555.8   3rd Qu.:4.000          
##  Max.   :29.000   Max.   :5.000   Max.   :2068.0   Max.   :4.000          
##                                                                           
##    HourlyRate     JobInvolvement    JobLevel     JobSatisfaction
##  Min.   : 30.00   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 48.00   1st Qu.:2.00   1st Qu.:1.000   1st Qu.:2.000  
##  Median : 66.00   Median :3.00   Median :2.000   Median :3.000  
##  Mean   : 65.89   Mean   :2.73   Mean   :2.064   Mean   :2.729  
##  3rd Qu.: 83.75   3rd Qu.:3.00   3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :100.00   Max.   :4.00   Max.   :5.000   Max.   :4.000  
##                                                                 
##  MonthlyIncome    MonthlyRate    NumCompaniesWorked PercentSalaryHike
##  Min.   : 1009   Min.   : 2094   Min.   :0.000      Min.   :11.00    
##  1st Qu.: 2911   1st Qu.: 8047   1st Qu.:1.000      1st Qu.:12.00    
##  Median : 4919   Median :14236   Median :2.000      Median :14.00    
##  Mean   : 6503   Mean   :14313   Mean   :2.693      Mean   :15.21    
##  3rd Qu.: 8379   3rd Qu.:20462   3rd Qu.:4.000      3rd Qu.:18.00    
##  Max.   :19999   Max.   :26999   Max.   :9.000      Max.   :25.00    
##                                                                      
##  PerformanceRating RelationshipSatisfaction StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :1.0000  
##  Mean   :3.154     Mean   :2.712            Mean   :0.7939  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :3.0000  
##                                                             
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.28     Mean   :2.799         Mean   :2.761   Mean   : 7.008  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.: 9.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##                                                                          
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 3.000     Median : 1.000          Median : 3.000      
##  Mean   : 4.229     Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :18.000     Max.   :15.000          Max.   :17.000      
## 

Some of the categorical variables will not work in an analysis because they are stored as integers rather than factors. Convert them to factors.

# Integer-coded categorical columns that must be treated as factors.
categorical_int_cols <- c(
  "Education",
  "EnvironmentSatisfaction",
  "JobInvolvement",
  "JobLevel",
  "JobSatisfaction",
  "PerformanceRating",
  "RelationshipSatisfaction",
  "StockOptionLevel",
  "WorkLifeBalance"
)
# Convert each listed column in place; column order is unchanged.
attrition[categorical_int_cols] <- lapply(
  attrition[categorical_int_cols],
  as.factor
)

*(comment: Joblevel description was missing from the original dataset, so we only have the category number).

View a Summary of the data

# Summarise again: the columns converted with as.factor() above are now
# reported as per-level counts instead of numeric quantiles.
summary(attrition)
##  Attrition            BusinessTravel                  Department 
##  No :1233   Non-Travel       : 150   Human Resources       : 63  
##  Yes: 237   Travel_Frequently: 277   Research & Development:961  
##             Travel_Rarely    :1043   Sales                 :446  
##                                                                  
##                                                                  
##                                                                  
##                                                                  
##           EducationField    Gender                         JobRole   
##  Human Resources : 27    Female:588   Sales Executive          :326  
##  Life Sciences   :606    Male  :882   Research Scientist       :292  
##  Marketing       :159                 Laboratory Technician    :259  
##  Medical         :464                 Manufacturing Director   :145  
##  Other           : 82                 Healthcare Representative:131  
##  Technical Degree:132                 Manager                  :102  
##                                       (Other)                  :215  
##  OverTime    MaritalStatus      Age          DailyRate     
##  No :1054   Divorced:327   Min.   :18.00   Min.   : 102.0  
##  Yes: 416   Married :673   1st Qu.:30.00   1st Qu.: 465.0  
##             Single  :470   Median :36.00   Median : 802.0  
##                            Mean   :36.92   Mean   : 802.5  
##                            3rd Qu.:43.00   3rd Qu.:1157.0  
##                            Max.   :60.00   Max.   :1499.0  
##                                                            
##  DistanceFromHome Education EmployeeNumber   EnvironmentSatisfaction
##  Min.   : 1.000   1:170     Min.   :   1.0   1:284                  
##  1st Qu.: 2.000   2:282     1st Qu.: 491.2   2:287                  
##  Median : 7.000   3:572     Median :1020.5   3:453                  
##  Mean   : 9.193   4:398     Mean   :1024.9   4:446                  
##  3rd Qu.:14.000   5: 48     3rd Qu.:1555.8                          
##  Max.   :29.000             Max.   :2068.0                          
##                                                                     
##    HourlyRate     JobInvolvement JobLevel JobSatisfaction MonthlyIncome  
##  Min.   : 30.00   1: 83          1:543    1:289           Min.   : 1009  
##  1st Qu.: 48.00   2:375          2:534    2:280           1st Qu.: 2911  
##  Median : 66.00   3:868          3:218    3:442           Median : 4919  
##  Mean   : 65.89   4:144          4:106    4:459           Mean   : 6503  
##  3rd Qu.: 83.75                  5: 69                    3rd Qu.: 8379  
##  Max.   :100.00                                           Max.   :19999  
##                                                                          
##   MonthlyRate    NumCompaniesWorked PercentSalaryHike PerformanceRating
##  Min.   : 2094   Min.   :0.000      Min.   :11.00     3:1244           
##  1st Qu.: 8047   1st Qu.:1.000      1st Qu.:12.00     4: 226           
##  Median :14236   Median :2.000      Median :14.00                      
##  Mean   :14313   Mean   :2.693      Mean   :15.21                      
##  3rd Qu.:20462   3rd Qu.:4.000      3rd Qu.:18.00                      
##  Max.   :26999   Max.   :9.000      Max.   :25.00                      
##                                                                        
##  RelationshipSatisfaction StockOptionLevel TotalWorkingYears
##  1:276                    0:631            Min.   : 0.00    
##  2:303                    1:596            1st Qu.: 6.00    
##  3:459                    2:158            Median :10.00    
##  4:432                    3: 85            Mean   :11.28    
##                                            3rd Qu.:15.00    
##                                            Max.   :40.00    
##                                                             
##  TrainingTimesLastYear WorkLifeBalance YearsAtCompany   YearsInCurrentRole
##  Min.   :0.000         1: 80           Min.   : 0.000   Min.   : 0.000    
##  1st Qu.:2.000         2:344           1st Qu.: 3.000   1st Qu.: 2.000    
##  Median :3.000         3:893           Median : 5.000   Median : 3.000    
##  Mean   :2.799         4:153           Mean   : 7.008   Mean   : 4.229    
##  3rd Qu.:3.000                         3rd Qu.: 9.000   3rd Qu.: 7.000    
##  Max.   :6.000                         Max.   :40.000   Max.   :18.000    
##                                                                           
##  YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 1.000          Median : 3.000      
##  Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :15.000          Max.   :17.000      
## 
# attach for ease of running code
# attach() puts the columns of `attrition` on the search path so they can be
# referenced by bare name. The attached copy is a snapshot: later changes to
# `attrition` are not reflected in it, and objects of the same name in the
# global environment take precedence.
# NOTE(review): presumably later code relies on these bare names -- confirm
# before replacing this with explicit `attrition$...` references.
attach(attrition)

Some initial things to note here:

  • Only 16% attrition in the data
  • BusinessTravel, OverTime, JobInvolvement, PerformanceRating, seem to be unbalanced. All other Variables seem balanced.
  • YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager may be skewed

Need logistic regression

*** The next several steps are the model fitting steps

Model fitting (Steps 1~zz)

1. Split the data into train and test data
We want to be able to create an appropriate model and then test that model on a held-out subset of the data. We will create a “training” dataset and a “test” one.
# Fix the RNG seed so the split is reproducible.
set.seed(123)
# Logical mask from caTools::sample.split: TRUE marks rows assigned to the
# training side (SplitRatio = 0.80), FALSE the remainder.
split <- sample.split(attrition$Attrition, SplitRatio = 0.80)
# Partition the rows with the mask: training rows, then test rows.
data.train <- attrition[split, ]
data.test <- attrition[!split, ]
2. Define null + full models for the training data
We need a NULL model (with no variables) to compare our training model to. We will compare the Deviance and AIC of the NULL Model vs the training model. We are looking for significant drop in both of these stats as a validation that the model is working.
# Use a wide console so printed tables are less likely to wrap.
options(width = 300)

# Intercept-only (null) logistic regression on the training data.
null.model.train <- glm(
  formula = Attrition ~ 1,
  family = binomial(link = "logit"),
  data = data.train
)

# Full logistic regression: Attrition against every other column.
full.model.train <- glm(
  formula = Attrition ~ .,
  family = binomial(link = "logit"),
  data = data.train
)

summary(full.model.train)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial(link = "logit"), 
##     data = data.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8888  -0.4561  -0.1967  -0.0584   3.6739  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -9.397e+00  6.424e+02  -0.015 0.988330    
## BusinessTravelTravel_Frequently   1.973e+00  4.971e-01   3.970 7.19e-05 ***
## BusinessTravelTravel_Rarely       9.463e-01  4.612e-01   2.052 0.040162 *  
## DepartmentResearch & Development  1.347e+01  6.424e+02   0.021 0.983277    
## DepartmentSales                   1.347e+01  6.424e+02   0.021 0.983274    
## EducationFieldLife Sciences      -9.159e-01  9.705e-01  -0.944 0.345297    
## EducationFieldMarketing          -5.541e-01  1.026e+00  -0.540 0.589208    
## EducationFieldMedical            -1.061e+00  9.657e-01  -1.099 0.271850    
## EducationFieldOther              -1.062e+00  1.052e+00  -1.010 0.312496    
## EducationFieldTechnical Degree   -1.206e-01  9.912e-01  -0.122 0.903150    
## GenderMale                        4.060e-01  2.172e-01   1.869 0.061656 .  
## JobRoleHuman Resources            1.378e+01  6.424e+02   0.021 0.982890    
## JobRoleLaboratory Technician      6.647e-01  6.741e-01   0.986 0.324097    
## JobRoleManager                   -1.572e-01  1.220e+00  -0.129 0.897457    
## JobRoleManufacturing Director     5.664e-01  6.402e-01   0.885 0.376268    
## JobRoleResearch Director         -2.135e+00  1.388e+00  -1.538 0.124131    
## JobRoleResearch Scientist        -6.835e-01  7.058e-01  -0.968 0.332903    
## JobRoleSales Executive            1.434e+00  1.450e+00   0.989 0.322653    
## JobRoleSales Representative       1.165e+00  1.528e+00   0.763 0.445612    
## OverTimeYes                       2.066e+00  2.357e-01   8.769  < 2e-16 ***
## MaritalStatusMarried              3.710e-01  3.209e-01   1.156 0.247708    
## MaritalStatusSingle               6.425e-01  4.564e-01   1.408 0.159182    
## Age                              -2.812e-02  1.589e-02  -1.770 0.076713 .  
## DailyRate                        -3.854e-04  2.606e-04  -1.479 0.139271    
## DistanceFromHome                  5.149e-02  1.286e-02   4.003 6.26e-05 ***
## Education2                        3.560e-01  3.918e-01   0.909 0.363550    
## Education3                        2.119e-01  3.502e-01   0.605 0.545108    
## Education4                        3.073e-01  3.747e-01   0.820 0.412153    
## Education5                        2.933e-01  7.307e-01   0.401 0.688090    
## EmployeeNumber                   -1.677e-04  1.839e-04  -0.912 0.361818    
## EnvironmentSatisfaction2         -1.334e+00  3.360e-01  -3.969 7.22e-05 ***
## EnvironmentSatisfaction3         -1.250e+00  2.980e-01  -4.195 2.73e-05 ***
## EnvironmentSatisfaction4         -1.410e+00  2.990e-01  -4.716 2.40e-06 ***
## HourlyRate                        4.870e-03  5.254e-03   0.927 0.354039    
## JobInvolvement2                  -1.309e+00  4.309e-01  -3.037 0.002388 ** 
## JobInvolvement3                  -1.681e+00  4.089e-01  -4.112 3.93e-05 ***
## JobInvolvement4                  -2.041e+00  5.415e-01  -3.769 0.000164 ***
## JobLevel2                        -1.677e+00  5.263e-01  -3.186 0.001444 ** 
## JobLevel3                         3.282e-01  8.144e-01   0.403 0.686981    
## JobLevel4                        -6.879e-01  1.400e+00  -0.491 0.623217    
## JobLevel5                         1.452e+00  1.975e+00   0.735 0.462083    
## JobSatisfaction2                 -4.466e-01  3.169e-01  -1.409 0.158823    
## JobSatisfaction3                 -5.249e-01  2.866e-01  -1.831 0.067051 .  
## JobSatisfaction4                 -1.214e+00  3.064e-01  -3.961 7.47e-05 ***
## MonthlyIncome                    -1.526e-04  1.070e-04  -1.425 0.154053    
## MonthlyRate                       1.950e-05  1.482e-05   1.316 0.188204    
## NumCompaniesWorked                2.007e-01  4.632e-02   4.332 1.48e-05 ***
## PercentSalaryHike                -3.810e-02  4.673e-02  -0.815 0.414835    
## PerformanceRating4                5.542e-02  4.846e-01   0.114 0.908947    
## RelationshipSatisfaction2        -8.635e-01  3.411e-01  -2.532 0.011356 *  
## RelationshipSatisfaction3        -8.888e-01  3.029e-01  -2.934 0.003344 ** 
## RelationshipSatisfaction4        -9.020e-01  3.004e-01  -3.003 0.002671 ** 
## StockOptionLevel1                -1.050e+00  3.627e-01  -2.894 0.003803 ** 
## StockOptionLevel2                -1.066e+00  4.908e-01  -2.172 0.029866 *  
## StockOptionLevel3                -5.810e-01  5.815e-01  -0.999 0.317755    
## TotalWorkingYears                -6.285e-02  3.547e-02  -1.772 0.076398 .  
## TrainingTimesLastYear            -1.377e-01  8.333e-02  -1.652 0.098548 .  
## WorkLifeBalance2                 -1.040e+00  4.259e-01  -2.441 0.014628 *  
## WorkLifeBalance3                 -1.571e+00  3.999e-01  -3.930 8.51e-05 ***
## WorkLifeBalance4                 -1.262e+00  4.845e-01  -2.604 0.009216 ** 
## YearsAtCompany                    1.210e-01  4.956e-02   2.442 0.014605 *  
## YearsInCurrentRole               -1.620e-01  5.740e-02  -2.823 0.004758 ** 
## YearsSinceLastPromotion           1.455e-01  5.012e-02   2.904 0.003690 ** 
## YearsWithCurrManager             -1.246e-01  5.428e-02  -2.295 0.021737 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1040.18  on 1175  degrees of freedom
## Residual deviance:  633.03  on 1112  degrees of freedom
## AIC: 761.03
## 
## Number of Fisher Scoring iterations: 15
There are a few things to note here:
  • We can see the far-right column with asterisks flagging the lowest p-values. Since this is the full model, with no variable selection applied yet, each p-value tests an individual term given all the other variables in the model; the flagged terms appear to have some significance in explaining Attrition.
  • The “Estimate” column gives us the log odds (logit scale). This will be back-transformed in a later step to an odds ratio, i.e. the multiplicative change in the odds of Attrition associated with that covariate value.
  • The Null Deviance (1040.18) is a measure of lack of fit of the model without any variables.
  • The Residual Deviance (633.03) is a measure of lack of fit of the model with all variables included.
  • AIC (761.03) is a measure of how well the model is fitting. We want to keep this as low as possible.
Sorting the model coefficients by p-value (ascending, smallest first) to view the most important covariates
# Coefficient table of the full model, ordered by p-value ascending
# (most significant terms first).
tempDF <- as.data.frame(summary(full.model.train)[["coefficients"]])
tempDF[order(tempDF[["Pr(>|z|)"]]), ]
##                                       Estimate   Std. Error     z value     Pr(>|z|)
## OverTimeYes                       2.066496e+00 2.356697e-01  8.76861346 1.808798e-18
## EnvironmentSatisfaction4         -1.410304e+00 2.990337e-01 -4.71620534 2.402839e-06
## NumCompaniesWorked                2.006728e-01 4.632009e-02  4.33230549 1.475560e-05
## EnvironmentSatisfaction3         -1.250120e+00 2.980255e-01 -4.19467519 2.732629e-05
## JobInvolvement3                  -1.681390e+00 4.089481e-01 -4.11150059 3.930959e-05
## DistanceFromHome                  5.148581e-02 1.286262e-02  4.00274540 6.261167e-05
## BusinessTravelTravel_Frequently   1.973438e+00 4.970795e-01  3.97006641 7.185260e-05
## EnvironmentSatisfaction2         -1.333660e+00 3.360157e-01 -3.96904008 7.216274e-05
## JobSatisfaction4                 -1.213550e+00 3.063924e-01 -3.96077056 7.470829e-05
## WorkLifeBalance3                 -1.571416e+00 3.998989e-01 -3.92953309 8.511096e-05
## JobInvolvement4                  -2.040537e+00 5.414673e-01 -3.76853114 1.642110e-04
## JobLevel2                        -1.676527e+00 5.262505e-01 -3.18579714 1.443558e-03
## JobInvolvement2                  -1.308709e+00 4.308921e-01 -3.03720794 2.387807e-03
## RelationshipSatisfaction4        -9.020471e-01 3.003511e-01 -3.00330930 2.670609e-03
## RelationshipSatisfaction3        -8.887712e-01 3.029005e-01 -2.93420155 3.344070e-03
## YearsSinceLastPromotion           1.455322e-01 5.012251e-02  2.90352946 3.689823e-03
## StockOptionLevel1                -1.049812e+00 3.627458e-01 -2.89407158 3.802815e-03
## YearsInCurrentRole               -1.620440e-01 5.740223e-02 -2.82295655 4.758302e-03
## WorkLifeBalance4                 -1.261649e+00 4.845143e-01 -2.60394504 9.215753e-03
## RelationshipSatisfaction2        -8.634599e-01 3.410797e-01 -2.53154875 1.135600e-02
## YearsAtCompany                    1.210194e-01 4.955692e-02  2.44202874 1.460498e-02
## WorkLifeBalance2                 -1.039934e+00 4.259488e-01 -2.44145443 1.462823e-02
## YearsWithCurrManager             -1.245621e-01 5.427698e-02 -2.29493430 2.173689e-02
## StockOptionLevel2                -1.065877e+00 4.907660e-01 -2.17186337 2.986597e-02
## BusinessTravelTravel_Rarely       9.463215e-01 4.611534e-01  2.05207533 4.016234e-02
## GenderMale                        4.059671e-01 2.172388e-01  1.86875916 6.165633e-02
## JobSatisfaction3                 -5.249475e-01 2.866475e-01 -1.83133497 6.705056e-02
## TotalWorkingYears                -6.285103e-02 3.546935e-02 -1.77198141 7.639764e-02
## Age                              -2.811934e-02 1.588587e-02 -1.77008462 7.671304e-02
## TrainingTimesLastYear            -1.376637e-01 8.333496e-02 -1.65193159 9.854850e-02
## JobRoleResearch Director         -2.134856e+00 1.388377e+00 -1.53766315 1.241310e-01
## DailyRate                        -3.853517e-04 2.606352e-04 -1.47850948 1.392715e-01
## MonthlyIncome                    -1.525736e-04 1.070422e-04 -1.42535949 1.540533e-01
## JobSatisfaction2                 -4.465805e-01 3.169395e-01 -1.40904014 1.588233e-01
## MaritalStatusSingle               6.425277e-01 4.563969e-01  1.40782657 1.591824e-01
## MonthlyRate                       1.950394e-05 1.482162e-05  1.31591095 1.882039e-01
## MaritalStatusMarried              3.709706e-01 3.209272e-01  1.15593375 2.477083e-01
## EducationFieldMedical            -1.061073e+00 9.656534e-01 -1.09881307 2.718496e-01
## EducationFieldOther              -1.062406e+00 1.051888e+00 -1.00999854 3.124960e-01
## StockOptionLevel3                -5.809903e-01 5.815237e-01 -0.99908270 3.177546e-01
## JobRoleSales Executive            1.434065e+00 1.449984e+00  0.98902152 3.226526e-01
## JobRoleLaboratory Technician      6.647463e-01 6.741342e-01  0.98607410 3.240968e-01
## JobRoleResearch Scientist        -6.834538e-01 7.058412e-01 -0.96828274 3.329032e-01
## EducationFieldLife Sciences      -9.159258e-01 9.705168e-01 -0.94375056 3.452971e-01
## HourlyRate                        4.869592e-03 5.254288e-03  0.92678425 3.540386e-01
## EmployeeNumber                   -1.677276e-04 1.839306e-04 -0.91190664 3.618179e-01
## Education2                        3.559715e-01 3.917708e-01  0.90862200 3.635497e-01
## JobRoleManufacturing Director     5.664383e-01 6.401926e-01  0.88479366 3.762679e-01
## Education4                        3.073242e-01 3.747351e-01  0.82011057 4.121531e-01
## PercentSalaryHike                -3.810140e-02 4.672639e-02 -0.81541506 4.148348e-01
## JobRoleSales Representative       1.165268e+00 1.527716e+00  0.76275124 4.456118e-01
## JobLevel5                         1.452142e+00 1.974573e+00  0.73542100 4.620831e-01
## Education3                        2.119179e-01 3.502153e-01  0.60510745 5.451076e-01
## EducationFieldMarketing          -5.540912e-01 1.026125e+00 -0.53998403 5.892080e-01
## JobLevel4                        -6.878840e-01 1.400140e+00 -0.49129655 6.232167e-01
## JobLevel3                         3.281793e-01 8.144302e-01  0.40295569 6.869808e-01
## Education5                        2.933429e-01 7.307101e-01  0.40144909 6.880895e-01
## JobRoleManager                   -1.572259e-01 1.219999e+00 -0.12887382 8.974575e-01
## EducationFieldTechnical Degree   -1.206066e-01 9.911554e-01 -0.12168282 9.031502e-01
## PerformanceRating4                5.542427e-02 4.846185e-01  0.11436679 9.089470e-01
## JobRoleHuman Resources            1.377763e+01 6.424413e+02  0.02144574 9.828901e-01
## DepartmentSales                   1.346853e+01 6.424410e+02  0.02096462 9.832739e-01
## DepartmentResearch & Development  1.346568e+01 6.424409e+02  0.02096018 9.832774e-01
## (Intercept)                      -9.396688e+00 6.424429e+02 -0.01462650 9.883302e-01
We can see the top variables, that, on their own, seem to have an effect in Attrition.
3. Run ANOVA
We want to run an ANOVA To further understand the impact of each variable into the model.
# Sequential analysis of deviance for the full model; every term gets a
# chi-square test as it is added, first to last.
anova(full.model.train, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Attrition
## 
## Terms added sequentially (first to last)
## 
## 
##                          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                      1175    1040.18              
## BusinessTravel            2   18.868      1173    1021.32 7.995e-05 ***
## Department                2    8.692      1171    1012.62 0.0129553 *  
## EducationField            5    6.340      1166    1006.28 0.2744831    
## Gender                    1    0.637      1165    1005.65 0.4246324    
## JobRole                   8   69.556      1157     936.09 6.024e-12 ***
## OverTime                  1   61.827      1156     874.26 3.751e-15 ***
## MaritalStatus             2   32.054      1154     842.21 1.095e-07 ***
## Age                       1    5.387      1153     836.82 0.0202852 *  
## DailyRate                 1    3.178      1152     833.64 0.0746336 .  
## DistanceFromHome          1   10.304      1151     823.34 0.0013276 ** 
## Education                 4    1.753      1147     821.59 0.7810091    
## EmployeeNumber            1    0.109      1146     821.48 0.7416105    
## EnvironmentSatisfaction   3   25.105      1143     796.37 1.468e-05 ***
## HourlyRate                1    0.090      1142     796.28 0.7642039    
## JobInvolvement            3   19.187      1139     777.10 0.0002501 ***
## JobLevel                  4   40.262      1135     736.83 3.821e-08 ***
## JobSatisfaction           3   18.479      1132     718.35 0.0003503 ***
## MonthlyIncome             1    4.053      1131     714.30 0.0440873 *  
## MonthlyRate               1    3.116      1130     711.18 0.0775034 .  
## NumCompaniesWorked        1   15.350      1129     695.83 8.932e-05 ***
## PercentSalaryHike         1    1.739      1128     694.10 0.1873082    
## PerformanceRating         1    0.004      1127     694.09 0.9465417    
## RelationshipSatisfaction  3   10.130      1124     683.96 0.0174921 *  
## StockOptionLevel          3    8.053      1121     675.91 0.0449197 *  
## TotalWorkingYears         1    1.976      1120     673.93 0.1598558    
## TrainingTimesLastYear     1    2.900      1119     671.03 0.0885629 .  
## WorkLifeBalance           3   15.866      1116     655.17 0.0012078 ** 
## YearsAtCompany            1    0.917      1115     654.25 0.3382468    
## YearsInCurrentRole        1    8.610      1114     645.64 0.0033438 ** 
## YearsSinceLastPromotion   1    7.425      1113     638.21 0.0064314 ** 
## YearsWithCurrManager      1    5.180      1112     633.03 0.0228466 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
If we take a look at the Residual Deviance column of the ANOVA above, we can get a feel for how much impact a single variable has in the model. The Deviance column gives the reduction in Residual Deviance (lack of fit) obtained when each variable is added. Variables are introduced sequentially in order of appearance in the dataset (column-wise), so each reduction depends on the variables entered before it. For instance, starting from the Null model (1040.18), when the first variable, “BusinessTravel”, is introduced, the Deviance drops by 18.87 points (to 1021.32). Later variables may contribute more or less to the overall Deviance. The variables that drop the Deviance to the greatest degree are the ones that may be contributing heavily to Attrition.
4. Fitting the model using STEP
We will use STEP as the selection procedure (direction=“both”), which starts with the NULL model and adds one variable at a time, checking the impact of each. A variable that is not meaningful is discarded. This process is repeated until all the variables have been considered. (Long output omitted; only the last few lines are shown here for brevity.)
## step(null.model.train, scope=list(upper=full.model.train), direction="both", test="Chisq", data=data.train)
## 
## . . . 
## < many steps later >
## . . . 
##                            Df Deviance    AIC    LRT  Pr(>Chi)    
## <none>                          653.05 745.05                     
## ...
## - DistanceFromHome          1   668.28 758.28 15.224 9.547e-05 ***
## - JobSatisfaction           3   672.87 758.87 19.820 0.0001850 ***
## - JobInvolvement            3   673.25 759.25 20.195 0.0001546 ***
## - NumCompaniesWorked        1   672.21 762.21 19.153 1.207e-05 ***
## - BusinessTravel            2   675.76 763.76 22.705 1.174e-05 ***
## - EnvironmentSatisfaction   3   683.97 769.97 30.921 8.831e-07 ***
## - JobLevel                  4   686.83 770.83 33.779 8.271e-07 ***
## - JobRole                   8   704.21 780.21 51.153 2.452e-08 ***
## - OverTime                  1   743.18 833.18 90.130 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## . . .
## Degrees of Freedom: 1175 Total (i.e. Null);  1130 Residual
## Null Deviance:       1040 
## Residual Deviance: 649.2     AIC: 741.2
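The output of the Step procedure above lists the terms in order of increasing importance: the further down the table a term appears, the more the AIC rises when that term is dropped. We will choose the 9 terms shown (the ones with the lowest p-values) for our Final Training Model.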
5. Create Final Training Model with top variables (lowest p-value).
# Refit on the training data with only the nine predictors kept after the
# stepwise search, then print the fitted model.
final.model.train <- glm(
  formula = Attrition ~ OverTime + JobRole + JobLevel +
    EnvironmentSatisfaction + BusinessTravel + NumCompaniesWorked +
    JobInvolvement + JobSatisfaction + DistanceFromHome,
  family = binomial(link = "logit"),
  data = data.train
)
summary(final.model.train)
## 
## Call:
## glm(formula = Attrition ~ OverTime + JobRole + JobLevel + EnvironmentSatisfaction + 
##     BusinessTravel + NumCompaniesWorked + JobInvolvement + JobSatisfaction + 
##     DistanceFromHome, family = binomial(link = "logit"), data = data.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7672  -0.5318  -0.2972  -0.1384   2.9413  
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -0.376960   0.809049  -0.466 0.641266    
## OverTimeYes                      1.703728   0.199136   8.556  < 2e-16 ***
## JobRoleHuman Resources           0.282504   0.679131   0.416 0.677426    
## JobRoleLaboratory Technician     0.439641   0.592724   0.742 0.458250    
## JobRoleManager                  -0.971051   0.989569  -0.981 0.326452    
## JobRoleManufacturing Director    0.004675   0.569080   0.008 0.993446    
## JobRoleResearch Director        -2.384876   1.190992  -2.002 0.045239 *  
## JobRoleResearch Scientist       -0.545522   0.612119  -0.891 0.372821    
## JobRoleSales Executive           1.296621   0.457348   2.835 0.004581 ** 
## JobRoleSales Representative      1.080430   0.647221   1.669 0.095051 .  
## JobLevel2                       -2.075147   0.420737  -4.932 8.13e-07 ***
## JobLevel3                       -1.142405   0.497756  -2.295 0.021727 *  
## JobLevel4                       -1.998374   0.727256  -2.748 0.005999 ** 
## JobLevel5                       -0.617082   1.160975  -0.532 0.595058    
## EnvironmentSatisfaction2        -1.236091   0.298756  -4.137 3.51e-05 ***
## EnvironmentSatisfaction3        -1.098946   0.253043  -4.343 1.41e-05 ***
## EnvironmentSatisfaction4        -1.200238   0.258863  -4.637 3.54e-06 ***
## BusinessTravelTravel_Frequently  1.379870   0.428882   3.217 0.001294 ** 
## BusinessTravelTravel_Rarely      0.593573   0.401833   1.477 0.139632    
## NumCompaniesWorked               0.107992   0.037124   2.909 0.003627 ** 
## JobInvolvement2                 -1.063282   0.375174  -2.834 0.004595 ** 
## JobInvolvement3                 -1.540777   0.354925  -4.341 1.42e-05 ***
## JobInvolvement4                 -2.007050   0.475901  -4.217 2.47e-05 ***
## JobSatisfaction2                -0.263847   0.285172  -0.925 0.354851    
## JobSatisfaction3                -0.407503   0.254084  -1.604 0.108755    
## JobSatisfaction4                -1.076902   0.272780  -3.948 7.88e-05 ***
## DistanceFromHome                 0.037731   0.011219   3.363 0.000771 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1040.18  on 1175  degrees of freedom
## Residual deviance:  757.71  on 1149  degrees of freedom
## AIC: 811.71
## 
## Number of Fisher Scoring iterations: 7
Sort the model by p-value
# Coefficient table of the final training model, ordered by p-value ascending
# (most significant terms first).
tempDF <- as.data.frame(summary(final.model.train)[["coefficients"]])
tempDF[order(tempDF[["Pr(>|z|)"]]), ]
##                                     Estimate Std. Error      z value     Pr(>|z|)
## OverTimeYes                      1.703728051 0.19913586  8.555606350 1.172508e-17
## JobLevel2                       -2.075147485 0.42073695 -4.932173176 8.131977e-07
## EnvironmentSatisfaction4        -1.200237626 0.25886288 -4.636576725 3.542267e-06
## EnvironmentSatisfaction3        -1.098945522 0.25304310 -4.342918298 1.406024e-05
## JobInvolvement3                 -1.540776733 0.35492533 -4.341129219 1.417523e-05
## JobInvolvement4                 -2.007049927 0.47590144 -4.217364685 2.471741e-05
## EnvironmentSatisfaction2        -1.236091203 0.29875636 -4.137455719 3.511782e-05
## JobSatisfaction4                -1.076902408 0.27277952 -3.947885866 7.884437e-05
## DistanceFromHome                 0.037731054 0.01121887  3.363179102 7.705036e-04
## BusinessTravelTravel_Frequently  1.379869980 0.42888209  3.217364400 1.293742e-03
## NumCompaniesWorked               0.107991628 0.03712439  2.908912829 3.626880e-03
## JobRoleSales Executive           1.296621357 0.45734763  2.835089280 4.581288e-03
## JobInvolvement2                 -1.063282476 0.37517447 -2.834101357 4.595475e-03
## JobLevel4                       -1.998374192 0.72725593 -2.747827989 5.999147e-03
## JobLevel3                       -1.142405118 0.49775586 -2.295111348 2.172674e-02
## JobRoleResearch Director        -2.384876479 1.19099203 -2.002428577 4.523866e-02
## JobRoleSales Representative      1.080429892 0.64722135  1.669335991 9.505081e-02
## JobSatisfaction3                -0.407503426 0.25408361 -1.603816278 1.087546e-01
## BusinessTravelTravel_Rarely      0.593573311 0.40183349  1.477162370 1.396321e-01
## JobRoleManager                  -0.971051006 0.98956937 -0.981286438 3.264515e-01
## JobSatisfaction2                -0.263847412 0.28517207 -0.925221783 3.548506e-01
## JobRoleResearch Scientist       -0.545521561 0.61211892 -0.891201924 3.728209e-01
## JobRoleLaboratory Technician     0.439641378 0.59272358  0.741730868 4.582504e-01
## JobLevel5                       -0.617081935 1.16097506 -0.531520405 5.950582e-01
## (Intercept)                     -0.376959731 0.80904868 -0.465929603 6.412659e-01
## JobRoleHuman Resources           0.282503674 0.67913145  0.415977897 6.774262e-01
## JobRoleManufacturing Director    0.004674672 0.56908005  0.008214437 9.934459e-01
6. Test Final Training Model Against Test data
We are looking for a measure of how accurate this Training Model is against a subset of the data.
# Score the held-out test rows with the training model. As in the original
# call, column 1 is left out of newdata; predict() looks predictors up by name.
predict.probs <- predict(
  final.model.train,
  newdata = subset(data.test, select = 2:32),
  type = "response"
)

# Classify as attrition (1) when the fitted probability exceeds 0.5.
predict.results <- ifelse(predict.probs > 0.5, 1, 0)

# Put the observed response on the same 0/1 scale as the predictions.
# A binomial glm() treats the first factor level as 0 and any other level as 1.
observed <- data.test$Attrition
if (is.character(observed)) {
  observed <- factor(observed)
}
if (is.factor(observed)) {
  observed <- as.integer(observed != levels(observed)[1L])
} else {
  observed <- as.numeric(observed)
}

# Misclassification rate = share of test rows whose predicted class differs
# from the observed class. The earlier formula, mean(predict.results), only
# measured the share of rows predicted as attrition, so the "Accuracy" value
# recorded below this chunk predates this fix and must be regenerated.
misClasificError <- mean(predict.results != observed)
print(paste('Accuracy', 1 - misClasificError))
## [1] "Accuracy 0.918367346938776"
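The chunk prints 0.918, but note what that number is: `1 - mean(predict.results)` is the share of test employees the model predicts will stay, not the share it classifies correctly. To measure accuracy, the predicted classes have to be compared with the observed Attrition values in data.test.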
6. Run Final model against all data.
Now we need to refit the Final model on all the data.
# Same nine-predictor logistic model as the training fit, this time estimated
# on the `attrition` data frame, followed by its printed summary.
final.model.alldata <- glm(
  formula = Attrition ~ OverTime + JobRole + JobLevel +
    EnvironmentSatisfaction + BusinessTravel + NumCompaniesWorked +
    JobInvolvement + JobSatisfaction + DistanceFromHome,
  family = binomial(link = "logit"),
  data = attrition
)
summary(final.model.alldata)
## 
## Call:
## glm(formula = Attrition ~ OverTime + JobRole + JobLevel + EnvironmentSatisfaction + 
##     BusinessTravel + NumCompaniesWorked + JobInvolvement + JobSatisfaction + 
##     DistanceFromHome, family = binomial(link = "logit"), data = attrition)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7212  -0.5375  -0.3002  -0.1340   3.0712  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     -0.83400    0.71661  -1.164 0.244499    
## OverTimeYes                      1.76957    0.17760   9.964  < 2e-16 ***
## JobRoleHuman Resources           0.47126    0.61437   0.767 0.443041    
## JobRoleLaboratory Technician     0.31962    0.53750   0.595 0.552085    
## JobRoleManager                  -0.99151    0.92744  -1.069 0.285033    
## JobRoleManufacturing Director    0.05172    0.51121   0.101 0.919414    
## JobRoleResearch Director        -2.22301    1.03063  -2.157 0.031010 *  
## JobRoleResearch Scientist       -0.44267    0.55256  -0.801 0.423059    
## JobRoleSales Executive           1.34413    0.41115   3.269 0.001078 ** 
## JobRoleSales Representative      1.09145    0.58897   1.853 0.063863 .  
## JobLevel2                       -2.08559    0.38254  -5.452 4.98e-08 ***
## JobLevel3                       -1.38643    0.45075  -3.076 0.002099 ** 
## JobLevel4                       -2.16495    0.68233  -3.173 0.001509 ** 
## JobLevel5                       -0.25018    1.02437  -0.244 0.807054    
## EnvironmentSatisfaction2        -0.99411    0.25952  -3.831 0.000128 ***
## EnvironmentSatisfaction3        -1.09562    0.23010  -4.762 1.92e-06 ***
## EnvironmentSatisfaction4        -1.20054    0.23452  -5.119 3.07e-07 ***
## BusinessTravelTravel_Frequently  1.66670    0.38353   4.346 1.39e-05 ***
## BusinessTravelTravel_Rarely      0.85413    0.35785   2.387 0.016993 *  
## NumCompaniesWorked               0.12553    0.03292   3.813 0.000137 ***
## JobInvolvement2                 -0.97597    0.32700  -2.985 0.002839 ** 
## JobInvolvement3                 -1.40635    0.30906  -4.550 5.35e-06 ***
## JobInvolvement4                 -2.10620    0.43373  -4.856 1.20e-06 ***
## JobSatisfaction2                -0.42584    0.25427  -1.675 0.093976 .  
## JobSatisfaction3                -0.48860    0.22383  -2.183 0.029042 *  
## JobSatisfaction4                -1.08938    0.23978  -4.543 5.54e-06 ***
## DistanceFromHome                 0.04245    0.01011   4.200 2.67e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1298.58  on 1469  degrees of freedom
## Residual deviance:  946.73  on 1443  degrees of freedom
## AIC: 1000.7
## 
## Number of Fisher Scoring iterations: 6
7. Obtain Odds Ratios of covariates
The Estimate in the final model is given in terms of log odds. We need to exponentiate the estimates to obtain the actual odds ratios of the top variables, and then sort the output.
Looking at the odds ratio table below, we can see that the Sales Representative category has the highest odds of Attrition; that is, Sales Representatives have about 8 times the odds of Attrition of the baseline (reference) job role. The second highest covariate is working overtime: people who work overtime have about 5 times the odds of Attrition. People who travel frequently have more than 4 times the odds of leaving. Job Roles like Laboratory Technician and Human Resources also have higher odds of leaving, based on the data.
Covariate/Value Odds Ratio
JobRoleSales Representative 8.3669402
OverTimeYes 5.3426139
BusinessTravelTravel_Frequently 4.7865786
JobRoleLaboratory Technician 3.6904708
JobRoleHuman Resources 3.4551899
JobRoleSales Executive 3.1500722
BusinessTravelTravel_Rarely 2.2161456
JobRoleResearch Scientist 1.7691884
JobRoleManager 1.5547907
NumCompaniesWorked 1.119052
DistanceFromHome 1.0362319

Job Roles

Job Roles

Note: All analysis has been performed on employees only (No attrition participants)

Key Findings

1) Manager Job Role:
      a) Only job role that exists in all departments. All other job roles are unique to a department.
      b) Have the longest average duration at the company
2) Managers and Directors
      a) Tend to be older than people in other job roles
      b) Have higher Job Levels
      c) Have higher Monthly Income
3) Manufacturing Director
      a) The only Job Role where Males do not outnumber Females
4) Human Resource Manager
      a) is the only Role which does not have ‘Low’ level of job involvement
      b) No PhDs (characteristic in common with Sale Reps)
      c) All HR Managers have some College level education (a characteristic unique to this Job Role)
5) Sales Reps and Research Scientists
      a) have the lowest Job Levels
6) Sales Reps
      a) is the only Job Role which has no employees with NumWorked = 9!!
      b) have the lowest average duration at the company
      c) change managers most frequently, while directors and managers tend to stay with the same manager for a longer time
      d) No PhDs (characteristic in common with HR Managers)
      e) the highest percentage of employees with college-level education (25%)

Departments

# Head-count per job role as horizontal stacked bars, filled by department.
ggplot(
  data = as.data.frame(
    table(noattrit$Department, noattrit$JobRole, dnn = list("Department", "JobRole"))
  ),
  mapping = aes(x = reorder(JobRole, Freq), y = Freq, fill = Department)
) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = alpha(c("steelblue4", "thistle4", "paleturquoise4"))) +
  coord_flip() +
  ggtitle("Number of Employees per Job Role by Department") +
  xlab("") +
  ylab("") +
  theme(plot.title = element_text(hjust = 0.5))

Linear Regression Modeling (See Experiment Tab) suggests that Education Field, Age, Job Involvement, Gender, Job Level, Monthly Income, Number of Companies Worked, Years At Company and Years with Current Manager differ between Job Roles

Education

# 100% stacked horizontal bars: mix of education fields within each job role.
ggplot(
  data = as.data.frame(
    table(noattrit$JobRole, noattrit$EducationField, dnn = list("JobRole", "EducationField"))
  ),
  mapping = aes(x = reorder(JobRole, Freq), y = Freq, fill = EducationField)
) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(
    values = alpha(c(
      "steelblue4", "thistle4", "paleturquoise4",
      "midnightblue", "mistyrose4", "lightcyan3"
    ))
  ) +
  coord_flip() +
  ggtitle("Percent of Employees per Job Role by Education Field") +
  xlab("") +
  ylab("Percent of Employees") +
  theme(plot.title = element_text(hjust = 0.5))

Though not identified as a first order differentiator by linear regression, Education is an interesting parameter in regards to Job Role

# 100% stacked horizontal bars: mix of education levels within each job role.
ggplot(
  data = as.data.frame(
    table(noattrit$JobRole, noattrit$Education, dnn = list("JobRole", "Education"))
  ),
  mapping = aes(x = reorder(JobRole, Freq), y = Freq, fill = Education)
) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(
    values = alpha(c(
      "steelblue4", "thistle4", "paleturquoise4", "mistyrose4", "lightcyan3"
    ))
  ) +
  coord_flip() +
  ggtitle("Education by Job Role \nNo PhD Sales Reps nor HR Managers \nAll HR Managers Have Some College") +
  xlab("") +
  ylab("% Employees") +
  theme(plot.title = element_text(hjust = 0.5))

Observations

All Departments have a mix of educational fields
People with Education in Marketing only work in the Sales Department
People with Education in Human Resources work in the Human Resources Department
R&D is predominantly composed of people with Life Sciences and Medical Education
No PhD Sales Reps nor HR Managers
**All HR Managers Have Some College (unique from all other job roles)**
Sales Reps have the highest percentage of employees with college-level education (25%)

Age

# Relabel every "Manager" row as "<Department> Manager" so managers from
# different departments show up as separate roles; all other roles keep their
# label (converted to character).
noattrit$JobRole <- with(
  noattrit,
  ifelse(JobRole == "Manager", paste(Department, JobRole), as.character(JobRole))
)

# Box plot of age for each (relabelled) job role, roles on the vertical axis.
ggplot(data = noattrit, mapping = aes(x = JobRole, y = Age)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Age by Job Role") +
  ylab("Age") +
  xlab("") +
  theme(plot.title = element_text(hjust = 0.5))

# Descriptive statistics of employee ages (min, quartiles, mean, max)
  summary(noattrit$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   31.00   36.00   37.56   43.00   60.00
# Simple linear regression of JRCode on Age; summary() prints the fit.
# NOTE(review): JRCode is defined outside this section. It looks like a numeric
# code for JobRole (assumption, to be confirmed); if the codes are arbitrary
# labels, the slope and p-value depend on the numbering rather than on the data.
  results <- lm(noattrit$JRCode~noattrit$Age)
summary(results)
## 
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$Age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0652 -2.0573  0.9379  1.9400  4.9427 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.0494530  0.2910012  17.352   <2e-16 ***
## noattrit$Age 0.0002632  0.0075984   0.035    0.972    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.31 on 1196 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  1.003e-06,  Adjusted R-squared:  -0.0008351 
## F-statistic: 0.0012 on 1 and 1196 DF,  p-value: 0.9724
Observations

1. Managers and Directors tend to be older than people in other roles
2. The oldest employee is a Sales Executive (60 years old)
3. The youngest employee is a Laboratory Technician (18 years old)

Gender

# 100% stacked horizontal bars: gender split within each job role, with a
# reference line at the 50% mark.
ggplot(
  data = as.data.frame(
    table(noattrit$JobRole, noattrit$Gender, dnn = list("JobRole", "Gender"))
  ),
  aes(x = reorder(JobRole, Freq), y = Freq, fill = Gender)
) +
  geom_bar(stat = "identity", position = "fill") +
  coord_flip() +
  ggtitle("Males OutNumber Females in Most Job Roles \n (Delta is not statiscally significant)") +
  xlab("") +
  ylab("Percent of Employees") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(values = alpha(c("steelblue4", "thistle4"))) +
  # A constant yintercept draws the line once; mapping it through
  # aes() with data = noattrit drew one identical line per data row.
  geom_hline(yintercept = 0.50, colour = "black")

Compensation

# Box plot of monthly income for each job role, roles on the vertical axis.
ggplot(
  data = noattrit,
  mapping = aes(x = JobRole, y = MonthlyIncome, group = JobRole)
) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Managers and Directors Have Higher Compensation \n (p-value < .00001)") +
  ylab("Monthly Income") +
  xlab("") +
  theme(plot.title = element_text(hjust = 0.5))

Job Levels

 # 100% stacked horizontal bars: share of each job level within every job role.
 # With coord_flip() the y aesthetic (the proportion) is drawn on the horizontal
 # axis, so "Percent of Employees" belongs in ylab(); it was previously passed to
 # xlab() and therefore labelled the job-role axis.
 ggplot(
   data = as.data.frame(
     table(noattrit$JobRole, noattrit$JobLevel, dnn = list("JobRole", "JobLevel"))
   ),
   aes(x = reorder(JobRole, Freq), y = Freq, fill = JobLevel)
 ) +
   geom_bar(stat = "identity", position = "fill") +
   coord_flip() +
   ggtitle("Managers & Directors Tend to Have Higher Job Levels") +
   xlab("") +
   ylab("Percent of Employees") +
   theme(plot.title = element_text(hjust = 0.5)) +
   scale_fill_manual(
     values = alpha(c(
       "steelblue4", "thistle4", "paleturquoise4", "mistyrose4", "lightcyan3"
     ))
   )

# Simple linear regression of JRCode on JobLevel; summary() prints the fit.
# NOTE(review): JRCode is defined outside this section. It looks like a numeric
# code for JobRole (assumption, to be confirmed); if the codes are arbitrary
# labels, the slope and p-value depend on the numbering rather than on the data.
results <- lm(noattrit$JRCode~noattrit$JobLevel)
summary(results)
## 
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$JobLevel)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.526 -1.795  0.718  2.205  4.718 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.55089    0.14517  31.349  < 2e-16 ***
## noattrit$JobLevel  0.24371    0.06191   3.937 8.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.295 on 1196 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.01279,    Adjusted R-squared:  0.01197 
## F-statistic:  15.5 on 1 and 1196 DF,  p-value: 8.742e-05
Observations:

1. Managers and Directors have higher Job Levels.
2. Sales Reps and Research Scientists have the lowest Job Levels.

Job Involvement

# 100% stacked horizontal bars: job-involvement levels within each job role.
# NOTE(review): this chart reads `attrit`, while the neighbouring charts in
# this section read `noattrit`; confirm which data set is intended.
# With coord_flip() the y aesthetic (the proportion) is drawn on the horizontal
# axis, so "Percent of Employees" belongs in ylab(); it was previously passed to
# xlab() and therefore labelled the job-role axis.
ggplot(
  data = as.data.frame(
    table(attrit$JobRole, attrit$JobInvolvement, dnn = list("JobRole", "JobInvolvement"))
  ),
  aes(x = reorder(JobRole, Freq), y = Freq, fill = JobInvolvement)
) +
  geom_bar(stat = "identity", position = "fill") +
  coord_flip() +
  ggtitle(" HR Managers are the only Role \nwithout Low Job Involvement") +
  xlab("") +
  ylab("Percent of Employees") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_manual(
    values = alpha(c("steelblue4", "thistle4", "paleturquoise4", "mistyrose4"))
  )

Work Experience

# Box plot of the number of companies worked at, for each job role.
# The column is referenced by bare name inside aes() (no `noattrit$`), so the
# mapping is evaluated against the plot's own data.
# With coord_flip() the y aesthetic is drawn on the horizontal axis, so
# "Number of Companies" belongs in ylab(); it was previously passed to xlab()
# and therefore labelled the job-role axis.
ggplot(data = noattrit, aes(x = JobRole, y = NumCompaniesWorked)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Many Employees have Experience at Several Companies") +
  xlab("") +
  ylab("Number of Companies") +
  theme(plot.title = element_text(hjust = 0.5))

Observations:

Sales Rep is the only Job Role which has no employees with NumWorked = 9!!

Years at Company

# Box plot of years at the company, for each job role.
# The column is referenced by bare name inside aes() (no `noattrit$`), so the
# mapping is evaluated against the plot's own data.
# With coord_flip() the y aesthetic is drawn on the horizontal axis, so
# "Number of Years at Company" belongs in ylab(); it was previously passed to
# xlab() and therefore labelled the job-role axis.
ggplot(data = noattrit, aes(x = JobRole, y = YearsAtCompany)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Significant Differences between Tenure by Job Role \n(p-value=.0008)") +
  xlab("") +
  ylab("Number of Years at Company") +
  theme(plot.title = element_text(hjust = 0.5))

# Simple linear regression of JRCode on YearsAtCompany; summary() prints the fit.
# NOTE(review): JRCode is defined outside this section. It looks like a numeric
# code for JobRole (assumption, to be confirmed); if the codes are arbitrary
# labels, the slope and p-value depend on the numbering rather than on the data.
results <- lm(noattrit$JRCode~noattrit$YearsAtCompany)
summary(results)
## 
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$YearsAtCompany)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.710 -1.954  0.756  1.945  5.121 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              4.87899    0.10601  46.025   <2e-16 ***
## noattrit$YearsAtCompany  0.02517    0.01152   2.186    0.029 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.305 on 1196 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.003979,   Adjusted R-squared:  0.003146 
## F-statistic: 4.778 on 1 and 1196 DF,  p-value: 0.02902
Observations

The Job Role “Sales Reps” has the lowest average duration at the company.
The Job Roles of Managers have the longest average duration at the company.

# Box plot of years with the current manager for each job role, roles on the
# vertical axis.
ggplot(
  data = noattrit,
  mapping = aes(x = JobRole, y = YearsWithCurrManager)
) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Years With Current Manager Vary By Job Role \n (p-value = .0425)") +
  ylab("Number of Years") +
  xlab("") +
  theme(plot.title = element_text(hjust = 0.5))

Observations:

Sales Reps change managers most frequently, while directors and managers tend to stay with the same manager for a longer time.

Company Specifics

Key Findings
67% of the Employees in the Company are in R&D
The average age of employees is 37.56 years, with the average age of employees in HR being slightly higher.
There are 46% more Male employees than Female employees. HR has the largest ratio of Males to Female at 2.6:1
Job Level is strongly correlated to Monthly Income. Higher Job Level = Higher Monthly Income.
Human Resource Managers are the only Role which does not have ‘Low’ level of job involvement
50% of Employees have been with the company <= 5 years
People with Marketing education only work in Sales, and people with HR education only work in HR. All departments have a mix of Educational Backgrounds.
Everyone receives a Performance Rating of Excellent or Outstanding.
Work Experience correlates to Job Level and therefore Monthly Income
Monthly Income is based on Work Experience, not Performance. (Job Level is correlated to Total Time Worked and Time At Company. Monthly Income is correlated to Job Level and Total Time Worked. All Performance Ratings are Excellent and Outstanding.)
The longer you work for a manager the longer the time between promotions.
Departments
# Head-count per department plus each department's share of all employees.
DepartmentDF <- as.data.frame(table(noattrit$Department))
colnames(DepartmentDF) <- c("Department", "DepartmentCount")

# Share of the company as a whole-number percentage string, e.g. "67 %".
# Computed once (the statement used to appear twice) and rounded numerically
# rather than through a format()/as.numeric() round trip.
DepartmentDF$CoDepartmntPercentage <- paste(
  round(100 * DepartmentDF$DepartmentCount / sum(DepartmentDF$DepartmentCount)),
  "%"
)

# Print largest department first, without row names.
print(
  DepartmentDF[order(-DepartmentDF$DepartmentCount), ],
  row.names = FALSE
)
##              Department DepartmentCount CoDepartmntPercentage
##  Research & Development             828                  67 %
##                   Sales             354                  29 %
##         Human Resources              51                   4 %
# Horizontal bars of head-count per department, smallest at the bottom.
ggplot(
  data = as.data.frame(table(noattrit$Department, dnn = list("Department"))),
  mapping = aes(x = reorder(Department, Freq), y = Freq)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Department Sizes \nno attrition") +
  ylab("") +
  xlab("")

Age Distribution
# Widen console output for the summaries printed after this chart.
options(width = 150)

# Histogram of employee age, stacked by department. Columns are referenced by
# bare name inside aes() (no `noattrit$`), which also gives the axis and legend
# the titles "Age" and "Department" instead of "noattrit$Age" and
# "noattrit$Department".
ggplot(data = noattrit, aes(x = Age, fill = Department)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Heading, then the company-wide age summary (min, quartiles, mean, max).
cat("Summary of Age Distribution for the Company")
## Summary of Age Distribution for the Company
summary(noattrit$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   31.00   36.00   37.56   43.00   60.00
# Heading, then the same summary computed separately for each department.
cat("Summary of Age Distributions by Department")
## Summary of Age Distributions by Department
# aggregate() applies summary() to Age within each Department group; the
# extra na.rm = TRUE argument is forwarded to FUN.
aggdata <-aggregate(noattrit$Age, by=list(noattrit$Department), 
FUN=summary, na.rm=TRUE)
print(aggdata)
##                  Group.1 x.Min. x.1st Qu. x.Median x.Mean x.3rd Qu. x.Max.
## 1        Human Resources  24.00     33.50    39.00  39.63     44.50  59.00
## 2 Research & Development  18.00     31.00    36.00  37.62     43.25  60.00
## 3                  Sales  18.00     31.00    36.00  37.14     42.75  60.00
Gender Distribution by Department
# 100% stacked horizontal bars: gender split within each department, with a
# reference line at the 50% mark.
ggplot(
  data = as.data.frame(
    table(noattrit$Department, noattrit$Gender, dnn = list("Department", "Gender"))
  ),
  aes(x = reorder(Department, Freq), y = Freq, fill = Gender)
) +
  geom_bar(stat = "identity", position = "fill") +
  coord_flip() +
  ggtitle("Gender by Department") +
  xlab("") +
  ylab("% Employees") +
  # A constant yintercept draws the line once; mapping it through
  # aes() with data = noattrit drew one identical line per data row.
  geom_hline(yintercept = 0.50, colour = "black")

# Bar chart of employee counts by gender. Gender is referenced by bare name
# inside aes() (no `noattrit$`), so the mapping is evaluated against the
# plot's own data.
ggplot(data = noattrit, aes(x = Gender, fill = Gender)) +
  geom_bar() +
  ggtitle("There are 46% more Males than Females in the Company \np-value=.04") +
  xlab("Gender")

# Recode Department into an integer DeptCode column
# (1 = Research & Development, 2 = Sales, 3 = Human Resources).
noattrit$DeptCode[noattrit$Department=="Research & Development"] <- 1L
noattrit$DeptCode[noattrit$Department=="Sales"] <- 2L
noattrit$DeptCode[noattrit$Department=="Human Resources"] <- 3L
# NOTE(review): t.test() on a contingency table runs a one-sample t-test of the
# cell counts against a mean of 0 (the output reports "One Sample t-test" with
# df = 5, i.e. the 6 cells). It does not test association between department
# and gender; a test of independence such as chisq.test() is presumably what
# was intended — confirm before quoting this p-value.
t.test(table(noattrit$DeptCode, noattrit$Gender))
## 
##  One Sample t-test
## 
## data:  table(noattrit$DeptCode, noattrit$Gender)
## t = 2.7531, df = 5, p-value = 0.04016
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   13.62051 397.37949
## sample estimates:
## mean of x 
##     205.5
# Scatter plot of monthly income against job level with a least-squares line.
# Columns are referenced by bare name inside aes() (ggplot2 discourages `$`
# there), and the unbalanced parenthesis in the title is closed.
ggplot(data = noattrit, aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  ggtitle("Monthly Income is correlated to Job Level \n(p-value=<.0001)") +
  ylab("Monthly Income") +
  xlab("Job Level") +
  geom_smooth(method = "lm", se = FALSE)

# Fit monthly income as a linear function of job level and show the summary.
results <- lm(formula = noattrit$MonthlyIncome ~ noattrit$JobLevel)
print(summary(results))
## 
## Call:
## lm(formula = noattrit$MonthlyIncome ~ noattrit$JobLevel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5119.4 -1021.7   136.6   810.0  3763.0 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1953.62      92.83  -21.05   <2e-16 ***
## noattrit$JobLevel  4094.33      38.37  106.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1505 on 1231 degrees of freedom
## Multiple R-squared:  0.9025, Adjusted R-squared:  0.9024 
## F-statistic: 1.139e+04 on 1 and 1231 DF,  p-value: < 2.2e-16
Job Involvement
 # 100%-stacked horizontal bars of job-involvement level per department with a
 # reference line at 0.9554. The line is a single constant intercept; mapping it
 # through aes() with data = noattrit drew one identical line per employee row.
 ggplot(
   data = as.data.frame(table(noattrit$Department, noattrit$JobInvolvement, dnn = list("Department", "JobInvolvement"))),
   aes(x = reorder(Department, Freq), y = Freq, fill = JobInvolvement)
 ) +
   geom_bar(stat = "identity", position = "fill") +
   coord_flip() +
   ggtitle("4.4% of the Company has a Low Level of Job Involvement") +
   xlab("") +
   ylab("") +
   geom_hline(yintercept = 0.9554, colour = "black")

# NOTE(review): this is a one-sample t-test of the 12 cell counts of the
# DeptCode x JobInvolvement table against a mean of 0 (output: df = 11), not a
# test of association between the two variables — confirm the intended test.
t.test(table(noattrit$DeptCode, noattrit$JobInvolvement))
## 
##  One Sample t-test
## 
## data:  table(noattrit$DeptCode, noattrit$JobInvolvement)
## t = 2.5003, df = 11, p-value = 0.02949
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   12.29995 193.20005
## sample estimates:
## mean of x 
##    102.75

Number of Years with Current Manager

# Bar chart of head count by years with the current manager. geom_bar() is the
# counting geom for a discrete x; geom_histogram(stat = "count") is what raised
# the "Ignoring unknown parameters: binwidth, bins, pad" warning. The column is
# referenced by bare name inside aes().
ggplot(data = noattrit, aes(x = factor(YearsWithCurrManager))) +
  geom_bar() +
  xlab("Years") +
  scale_x_discrete(name = "Years With Current Manager") +
  ggtitle("Years With Current Manager")

Observations:
Multiple Modal Distribution
Education
# Share of remaining employees whose Education code is 1, reported to one
# significant digit. The denominator comes from the data instead of the
# hard-coded 1233, and rows with a missing Education value are not counted
# as matches.
cat(
  "Percentage of Employees with No College-Level Education:",
  paste(
    100 * signif(sum(noattrit$Education == 1, na.rm = TRUE) / nrow(noattrit), 1),
    "%"
  )
)
## Percentage of Employees with No College-Level Education: 10 %
# Bar chart of head count per education level. geom_bar() replaces
# geom_histogram(stat = "count"), which raised the "Ignoring unknown
# parameters: binwidth, bins, pad" warning; the column is referenced by bare
# name inside aes().
ggplot(data = noattrit, aes(x = factor(Education))) +
  geom_bar() +
  xlab(" ") +
  scale_x_discrete(name = "Education") +
  ggtitle("Education Level")

Compensation is based on Work Experience not Performance

Job Level vs Total Working Years
# Horizontal box plots of total working years for each job level. Columns are
# referenced by bare name inside aes() instead of via `noattrit$`.
ggplot(data = noattrit, aes(x = JobLevel, y = TotalWorkingYears, group = JobLevel)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Work Experience Correlates to Job Level") +
  xlab("Job Level") +
  ylab("Years")

#####Working Years vs Monthly Income

# Scatter plot of monthly income against total working years, coloured by job
# level. Columns are referenced by bare name inside aes(), so the legend title
# reads "factor(JobLevel)" instead of "factor(noattrit$JobLevel)".
ggplot(
  data = noattrit,
  aes(x = TotalWorkingYears, y = MonthlyIncome, col = factor(JobLevel))
) +
  geom_point() +
  ggtitle("Total Working Years is Correlated to Job Level \np-value <.00001") +
  xlab("") +
  ylab("")

# Fit job level as a linear function of total working years and show the summary.
results <- lm(formula = noattrit$JobLevel ~ noattrit$TotalWorkingYears)
print(summary(results))
## 
## Call:
## lm(formula = noattrit$JobLevel ~ noattrit$TotalWorkingYears)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.06151 -0.48633  0.06362  0.40116  2.17613 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.811251   0.036340   22.32   <2e-16 ***
## noattrit$TotalWorkingYears 0.112513   0.002564   43.89   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6984 on 1231 degrees of freedom
## Multiple R-squared:  0.6101, Adjusted R-squared:  0.6098 
## F-statistic:  1926 on 1 and 1231 DF,  p-value: < 2.2e-16

Total Time At Company vs Monthly Income

# Scatter plot of monthly income against years at the company, coloured by job
# level. Columns are referenced by bare name inside aes(), so the legend title
# reads "factor(JobLevel)" instead of "factor(noattrit$JobLevel)".
ggplot(
  data = noattrit,
  aes(x = YearsAtCompany, y = MonthlyIncome, col = factor(JobLevel))
) +
  geom_point() +
  ggtitle("Years At Company vs Monthly Income by JobLevel") +
  xlab("") +
  ylab("")

 # Regress monthly income on job level, total working years and years at the
 # company, then show the model summary.
 results <- lm(
   formula = noattrit$MonthlyIncome ~
     noattrit$JobLevel + noattrit$TotalWorkingYears + noattrit$YearsAtCompany
 )
 print(summary(results))
## 
## Call:
## lm(formula = noattrit$MonthlyIncome ~ noattrit$JobLevel + noattrit$TotalWorkingYears + 
##     noattrit$YearsAtCompany)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5295.9 -1004.0    90.5   849.6  3964.0 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -1943.842     92.416 -21.034  < 2e-16 ***
## noattrit$JobLevel           3848.637     60.956  63.138  < 2e-16 ***
## noattrit$TotalWorkingYears    53.394      9.388   5.688 1.61e-08 ***
## noattrit$YearsAtCompany      -15.735      8.680  -1.813   0.0701 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1487 on 1229 degrees of freedom
## Multiple R-squared:  0.905,  Adjusted R-squared:  0.9047 
## F-statistic:  3901 on 3 and 1229 DF,  p-value: < 2.2e-16
Performance Rating
# Bar chart of head count per performance rating. The x-axis title was
# 'Education' (copied from the education chart) and now names the plotted
# variable. geom_bar() replaces geom_histogram(stat = "count"), which raised
# the "Ignoring unknown parameters: binwidth, bins, pad" warning.
ggplot(data = noattrit, aes(x = factor(PerformanceRating))) +
  geom_bar() +
  xlab(" ") +
  scale_x_discrete(name = "Performance Rating") +
  ggtitle("Everyone Receives a Performance Rating of Excellent or Outstanding")

# Box plots of monthly income for each performance rating. Columns are
# referenced by bare name inside aes() instead of via `noattrit$`.
ggplot(data = noattrit, aes(x = PerformanceRating, y = MonthlyIncome, group = PerformanceRating)) +
  geom_boxplot() +
  ggtitle("Monthly Income by Performance Rating") +
  xlab("Performance Rating") +
  ylab("Monthly Income")

Years Since Promotion
# Scatter plot of years since last promotion against years with the current
# manager, with a least-squares line. Columns are referenced by bare name
# inside aes() instead of via `noattrit$`.
ggplot(data = noattrit, aes(x = YearsWithCurrManager, y = YearsSinceLastPromotion)) +
  geom_point() +
  ggtitle("The Longer You Stay with a Manager the Longer Time Between Promotions") +
  xlab("Years With Current Manager") +
  ylab("Years Since Last Promotion") +
  stat_smooth(method = "lm", se = FALSE)

# Fit years since last promotion as a linear function of years with the
# current manager and show the summary.
results <- lm(
  formula = noattrit$YearsSinceLastPromotion ~ noattrit$YearsWithCurrManager
)
print(summary(results))
## 
## Call:
## lm(formula = noattrit$YearsSinceLastPromotion ~ noattrit$YearsWithCurrManager)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8662 -1.6387 -0.3318  0.7969 14.6682 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    0.33183    0.12694   2.614  0.00906 ** 
## noattrit$YearsWithCurrManager  0.43563    0.02245  19.407  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.832 on 1231 degrees of freedom
## Multiple R-squared:  0.2343, Adjusted R-squared:  0.2337 
## F-statistic: 376.6 on 1 and 1231 DF,  p-value: < 2.2e-16
# Horizontal box plots of years since last promotion for each job level.
# Columns are referenced by bare name inside aes() instead of via `noattrit$`.
ggplot(data = noattrit, aes(x = JobLevel, y = YearsSinceLastPromotion, group = JobLevel)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Time Since Last Promotion Increases as Job Level Increases") +
  xlab("Job Level") +
  ylab("Years")

Percent Salary Hike

# Horizontal box plots of percent salary hike for each job level. Columns are
# referenced by bare name inside aes() instead of via `noattrit$`.
ggplot(data = noattrit, aes(x = JobLevel, y = PercentSalaryHike, group = JobLevel)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("All Job Levels Received About 15% Average Salary Increase \nwith roughly similar distributions") +
  xlab("Job Level") +
  ylab("% Salary Increase")

Job Satisfaction

Key Findings
1. Job Satisfaction decreases as Hourly Rate increases
2. PhDs have the highest satisfaction
3. People with Bachelor Degrees are the most dissatisfied
4. Satisfaction is consistent across Genders
5. Satisfaction varies by Job Role with R&D Manager having the lowest satisfaction and Sales Reps and HR Managers having the highest
Hourly Rate
Suggested by Linear Regression Modeling (See Experimentation Tab) as weak factor for Job Satisfaction (other potential factors were identified but did not pass on review)
# Horizontal box plots of hourly rate for each job-satisfaction level.
# The title previously said satisfaction decreases as hourly rate *decreases*;
# the fitted HourlyRate coefficients in the job-satisfaction regressions are
# negative, i.e. satisfaction falls as hourly rate rises, so the title now
# states that direction. Columns are referenced by bare name inside aes().
ggplot(data = noattrit, aes(x = JobSatisfaction, y = HourlyRate, group = JobSatisfaction)) +
  geom_boxplot() +
  coord_flip() +
  ggtitle("Job Satisfaction Decreases as Hourly Rate Increases \np-value=.05") +
  xlab("Job Satisfaction") +
  ylab("Hourly Rate")

Education

# 100%-stacked bars of job-satisfaction level per education level, with a
# dashed reference line at 0.5.
# NOTE(review): the p-value in the title matches the t.test() on the count
# table printed after these plots, which is a one-sample test of cell counts
# against 0 — confirm it is the intended test.
ggplot(
  data = as.data.frame(table(noattrit$Education, noattrit$JobSatisfaction, dnn = list("Education", "JobSatisfaction"))),
  aes(x = Education, y = Freq, fill = JobSatisfaction)
) +
  geom_bar(stat = "identity", position = "fill") +
  ggtitle("PhDs Have the Highest Job Satisfaction \n People with Bachelor Degrees Have the Lowest Job Satisfaction \np-value=.01263") +
  xlab("Education Level") +
  ylab("") +
  geom_abline(slope = 0, intercept = 0.5, col = "black", lty = 2) +
  coord_flip()

# Same layout using the recoded JSCode column, with the reference line at 0.9.
# The title's "is Varies" typo is corrected.
ggplot(
  data = as.data.frame(table(noattrit$Education, noattrit$JSCode, dnn = list("Education", "JobSatisfaction"))),
  aes(x = Education, y = Freq, fill = JobSatisfaction)
) +
  geom_bar(stat = "identity", position = "fill") +
  ggtitle("Job Satisfaction Varies by Education") +
  xlab("") +
  ylab("") +
  geom_abline(slope = 0, intercept = 0.9, col = "black", lty = 2) +
  coord_flip()

# NOTE(review): this is a one-sample t-test of the 10 cell counts of the
# JSCodeNum x Education table against a mean of 0 (output: df = 9), not a test
# of association between satisfaction and education — confirm the intended test.
t.test(table(noattrit$JSCodeNum, noattrit$Education))
## 
##  One Sample t-test
## 
## data:  table(noattrit$JSCodeNum, noattrit$Education)
## t = 3.1046, df = 9, p-value = 0.01263
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##   33.45881 213.14119
## sample estimates:
## mean of x 
##     123.3

Gender

# 100%-stacked bars of job-satisfaction level per gender, with a dashed
# reference line at 0.5.
ggplot(
  data = as.data.frame(table(noattrit$Gender, noattrit$JobSatisfaction, dnn = list("Gender", "JobSatisfaction"))),
  mapping = aes(x = Gender, y = Freq, fill = JobSatisfaction)
) +
  geom_bar(stat = "identity", position = "fill") +
  ggtitle("Job Satisfaction is Consistent Across Genders") +
  labs(x = "", y = "") +
  geom_abline(slope = 0, intercept = 0.5, col = "black", lty = 2)

Job Role

# 100%-stacked horizontal bars of job-satisfaction level per job role, with a
# dashed reference line at 0.9. Title typos ("is Varies", "Satsifed") are
# corrected.
# NOTE(review): the title names "R&D Director" as the least satisfied role
# while the Key Findings text names the R&D Manager — confirm which is meant.
ggplot(
  data = as.data.frame(table(noattrit$JobRole, noattrit$JobSatisfaction, dnn = list("JobRole", "JobSatisfaction"))),
  aes(x = JobRole, y = Freq, fill = JobSatisfaction)
) +
  geom_bar(stat = "identity", position = "fill") +
  ggtitle("Job Satisfaction Varies by Job Role \nTop Dissatisfied Roles:R&D Director  \nMost Satisfied: Sales Reps and HR Manager") +
  xlab("") +
  ylab("") +
  geom_abline(slope = 0, intercept = 0.9, col = "black", lty = 2) +
  coord_flip()

# NOTE(review): this is a one-sample t-test of the 22 cell counts of the
# JSCodeNum x JobRole table against a mean of 0 (output: df = 21), not a test
# of association between satisfaction and job role — confirm the intended test.
t.test(table(noattrit$JSCodeNum, noattrit$JobRole))
## 
##  One Sample t-test
## 
## data:  table(noattrit$JSCodeNum, noattrit$JobRole)
## t = 4.1555, df = 21, p-value = 0.000448
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  27.99778 84.09313
## sample estimates:
## mean of x 
##  56.04545

Interesting Findings

Key Findings:
NumCompaniesWorked has evidence of ambiguity in the data and is perhaps miscoded or inaccurate. (See discussion below).
No relationship between the different compensation rates. Would have expected a multiplicative relationship between Daily Rate and Hourly Rate, or Daily Rate and Monthly Rate, etc.
Observation:
Strong Correlation between observation incoming order and employee number
Number of Companies Worked
# Numeric summary of the number of companies each remaining employee has worked for.
summary(noattrit$NumCompaniesWorked)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.000   2.646   4.000   9.000
# Bar chart of head count per value. geom_bar() replaces
# geom_histogram(stat = "count"), which raised the "Ignoring unknown
# parameters: binwidth, bins, pad" warning; the column is referenced by bare
# name inside aes().
ggplot(data = noattrit, aes(x = factor(NumCompaniesWorked))) +
  geom_bar() +
  xlab("Number CompaniesWorked") +
  scale_x_discrete(name = "Number of Companies") +
  ggtitle("Number of Companies Worked")

Observations:
There may be confusion in the coding of this variable. What does 0 mean? Since these are employees at this company, everyone has worked for at least one company. Or maybe the variable is to be interpreted as the number of companies worked for other than this one. Seems unlikely as the most frequent value are

Compensation Rates - No correlations

Monthly Income vs Monthly Rate
# Scatter plot of monthly income against monthly rate with a least-squares
# line. Columns are referenced by bare name inside aes() instead of via
# `noattrit$`.
ggplot(data = noattrit, aes(x = MonthlyRate, y = MonthlyIncome)) +
  geom_point() +
  ggtitle(" No Correlation between Monthly Income vs Monthly Rate \n p-value=.261") +
  xlab("Monthly Rate") +
  ylab("Monthly Income") +
  stat_smooth(method = "lm", se = FALSE)

# Fit monthly income as a linear function of monthly rate and show the summary.
result <- lm(formula = noattrit$MonthlyIncome ~ noattrit$MonthlyRate)
print(summary(result))
## 
## Call:
## lm(formula = noattrit$MonthlyIncome ~ noattrit$MonthlyRate)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5979  -3597  -1645   1986  13353 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.522e+03  3.079e+02  21.180   <2e-16 ***
## noattrit$MonthlyRate 2.175e-02  1.933e-02   1.126    0.261    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4818 on 1231 degrees of freedom
## Multiple R-squared:  0.001028,   Adjusted R-squared:  0.0002168 
## F-statistic: 1.267 on 1 and 1231 DF,  p-value: 0.2605
# Pairwise scatter plots of the compensation-rate variables. In all four plots
# columns are referenced by bare name inside aes() instead of via `noattrit$`.

# Monthly income against daily rate, with a least-squares line.
ggplot(data = noattrit, aes(x = DailyRate, y = MonthlyIncome)) +
  geom_point() +
  ggtitle(" No Correlation between Monthly Income vs Daily Rate") +
  xlab("Daily Rate") +
  ylab("Monthly Income") +
  stat_smooth(method = "lm", se = FALSE)

# Monthly rate against daily rate, with a least-squares line.
ggplot(data = noattrit, aes(x = DailyRate, y = MonthlyRate)) +
  geom_point() +
  ggtitle(" No Correlation between Monthly Rate & Daily Rate") +
  xlab("Daily Rate") +
  ylab("Monthly Rate") +
  stat_smooth(method = "lm", se = FALSE)

# Daily rate against hourly rate, with a least-squares line.
ggplot(data = noattrit, aes(x = HourlyRate, y = DailyRate)) +
  geom_point() +
  ggtitle(" No Correlation between Hourly Rate & Daily Rate") +
  xlab("Hourly Rate") +
  ylab("Daily Rate") +
  stat_smooth(method = "lm", se = FALSE)

# Daily rate against years at the company, coloured by job level.
ggplot(data = noattrit, aes(x = YearsAtCompany, y = DailyRate, col = factor(JobLevel))) +
  geom_point() +
  ggtitle("Daily Rate vs Years At Company by JobLevel") +
  xlab("Years") +
  ylab("Daily Rate")

The strong correlations found between TimeAtCompany & Total Working Time with Monthly Income are not evident with DailyRate, Hourly Rate nor Monthly Rate.

Attempted to determine relationships using log-log, linear-log, log-linear transformations and multiplicative models.

Experimentation

1.Employee Number - a recoded start date?….No 2.Linear Regression Modeling to id first order variables to differentiate factor variables

Used Linear Regression Modeling to determine variables of significance between Job Roles

Important first order terms in the model are: EducationField,Age,JobLevel,MonthlyIncome, Gender, JobInvolvement,NumCompaniesWorked,YearsAtCompany,YearsWithCurrManager

options(width = 150)

# Regress the job-role code on every candidate predictor to see which
# first-order terms come out as significant.
jr.lm <- lm(
  formula = noattrit$JRCode ~
    noattrit$BusinessTravel +
    noattrit$EducationField +
    noattrit$Gender +
    noattrit$OverTime +
    noattrit$MaritalStatus +
    noattrit$Age +
    noattrit$DailyRate +
    noattrit$DistanceFromHome +
    noattrit$Education +
    noattrit$EmployeeNumber +
    noattrit$EnvironmentSatisfaction +
    noattrit$HourlyRate +
    noattrit$JobInvolvement +
    noattrit$JobLevel +
    noattrit$JobSatisfaction +
    noattrit$MonthlyIncome +
    noattrit$MonthlyRate +
    noattrit$NumCompaniesWorked +
    noattrit$PercentSalaryHike +
    noattrit$PerformanceRating +
    noattrit$RelationshipSatisfaction +
    noattrit$StockOptionLevel +
    noattrit$TotalWorkingYears +
    noattrit$TrainingTimesLastYear +
    noattrit$WorkLifeBalance +
    noattrit$YearsAtCompany +
    noattrit$YearsInCurrentRole +
    noattrit$YearsSinceLastPromotion +
    noattrit$YearsWithCurrManager
)
print(summary(jr.lm))
## 
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$BusinessTravel + noattrit$EducationField + 
##     noattrit$Gender + noattrit$OverTime + noattrit$MaritalStatus + 
##     noattrit$Age + noattrit$DailyRate + noattrit$DistanceFromHome + 
##     noattrit$Education + noattrit$EmployeeNumber + noattrit$EnvironmentSatisfaction + 
##     noattrit$HourlyRate + noattrit$JobInvolvement + noattrit$JobLevel + 
##     noattrit$JobSatisfaction + noattrit$MonthlyIncome + noattrit$MonthlyRate + 
##     noattrit$NumCompaniesWorked + noattrit$PercentSalaryHike + 
##     noattrit$PerformanceRating + noattrit$RelationshipSatisfaction + 
##     noattrit$StockOptionLevel + noattrit$TotalWorkingYears + 
##     noattrit$TrainingTimesLastYear + noattrit$WorkLifeBalance + 
##     noattrit$YearsAtCompany + noattrit$YearsInCurrentRole + noattrit$YearsSinceLastPromotion + 
##     noattrit$YearsWithCurrManager)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5355 -1.6075 -0.1949  1.8112  4.7772 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                               4.410e+00  1.087e+00   4.056 5.33e-05 ***
## noattrit$BusinessTravelTravel_Frequently -1.847e-01  2.466e-01  -0.749  0.45416    
## noattrit$BusinessTravelTravel_Rarely     -9.590e-02  2.067e-01  -0.464  0.64281    
## noattrit$EducationFieldLife Sciences      1.170e+00  5.093e-01   2.297  0.02180 *  
## noattrit$EducationFieldMarketing          2.600e+00  5.437e-01   4.782 1.95e-06 ***
## noattrit$EducationFieldMedical            1.083e+00  5.123e-01   2.114  0.03469 *  
## noattrit$EducationFieldOther              1.002e+00  5.646e-01   1.774  0.07633 .  
## noattrit$EducationFieldTechnical Degree   1.435e+00  5.486e-01   2.616  0.00901 ** 
## noattrit$GenderMale                      -2.214e-01  1.319e-01  -1.678  0.09361 .  
## noattrit$OverTimeYes                     -4.345e-02  1.551e-01  -0.280  0.77937    
## noattrit$MaritalStatusMarried             1.003e-01  1.698e-01   0.591  0.55479    
## noattrit$MaritalStatusSingle              2.164e-01  2.398e-01   0.902  0.36701    
## noattrit$Age                             -1.781e-02  1.024e-02  -1.739  0.08234 .  
## noattrit$DailyRate                       -2.736e-05  1.609e-04  -0.170  0.86496    
## noattrit$DistanceFromHome                -4.919e-03  8.074e-03  -0.609  0.54250    
## noattrit$Education                       -5.459e-03  6.475e-02  -0.084  0.93283    
## noattrit$EmployeeNumber                  -1.061e-04  1.064e-04  -0.998  0.31872    
## noattrit$EnvironmentSatisfaction         -1.516e-02  6.083e-02  -0.249  0.80328    
## noattrit$HourlyRate                      -2.095e-03  3.171e-03  -0.661  0.50902    
## noattrit$JobInvolvement                   2.160e-01  9.312e-02   2.320  0.02051 *  
## noattrit$JobLevel                        -1.073e+00  1.932e-01  -5.555 3.44e-08 ***
## noattrit$JobSatisfaction                  4.070e-02  5.940e-02   0.685  0.49335    
## noattrit$MonthlyIncome                    3.457e-04  4.427e-05   7.810 1.27e-14 ***
## noattrit$MonthlyRate                      3.592e-06  9.061e-06   0.396  0.69182    
## noattrit$NumCompaniesWorked              -6.608e-02  2.965e-02  -2.229  0.02603 *  
## noattrit$PercentSalaryHike               -5.725e-03  2.766e-02  -0.207  0.83609    
## noattrit$PerformanceRating                8.678e-02  2.784e-01   0.312  0.75531    
## noattrit$RelationshipSatisfaction        -3.219e-03  6.047e-02  -0.053  0.95755    
## noattrit$StockOptionLevel                 3.163e-02  1.037e-01   0.305  0.76049    
## noattrit$TotalWorkingYears                1.107e-03  1.807e-02   0.061  0.95118    
## noattrit$TrainingTimesLastYear           -1.385e-02  5.001e-02  -0.277  0.78181    
## noattrit$WorkLifeBalance                 -4.880e-02  9.561e-02  -0.510  0.60989    
## noattrit$YearsAtCompany                   4.173e-02  2.366e-02   1.764  0.07807 .  
## noattrit$YearsInCurrentRole              -2.674e-02  2.927e-02  -0.913  0.36118    
## noattrit$YearsSinceLastPromotion          1.450e-02  2.607e-02   0.556  0.57803    
## noattrit$YearsWithCurrManager            -7.326e-02  2.980e-02  -2.459  0.01409 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.206 on 1162 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.114,  Adjusted R-squared:  0.08734 
## F-statistic: 4.273 on 35 and 1162 DF,  p-value: 6.053e-15
# Refit the job-role model with a reduced set of predictors and check how well
# it fits.
jr.lm <- lm(
  formula = noattrit$JRCode ~
    noattrit$EducationField +
    noattrit$Age +
    noattrit$JobLevel +
    noattrit$MonthlyIncome +
    noattrit$Gender +
    noattrit$JobInvolvement +
    noattrit$NumCompaniesWorked +
    noattrit$YearsAtCompany +
    noattrit$YearsWithCurrManager
)
print(summary(jr.lm))
## 
## Call:
## lm(formula = noattrit$JRCode ~ noattrit$EducationField + noattrit$Age + 
##     noattrit$JobLevel + noattrit$MonthlyIncome + noattrit$Gender + 
##     noattrit$JobInvolvement + noattrit$NumCompaniesWorked + noattrit$YearsAtCompany + 
##     noattrit$YearsWithCurrManager)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3740 -1.5687 -0.2193  1.8478  4.8356 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                              4.276e+00  6.369e-01   6.714 2.93e-11 ***
## noattrit$EducationFieldLife Sciences     1.141e+00  5.016e-01   2.274  0.02313 *  
## noattrit$EducationFieldMarketing         2.554e+00  5.357e-01   4.768 2.09e-06 ***
## noattrit$EducationFieldMedical           1.057e+00  5.037e-01   2.098  0.03610 *  
## noattrit$EducationFieldOther             9.505e-01  5.568e-01   1.707  0.08809 .  
## noattrit$EducationFieldTechnical Degree  1.389e+00  5.399e-01   2.573  0.01021 *  
## noattrit$Age                            -1.773e-02  8.586e-03  -2.065  0.03915 *  
## noattrit$JobLevel                       -1.073e+00  1.881e-01  -5.702 1.50e-08 ***
## noattrit$MonthlyIncome                   3.465e-04  4.324e-05   8.015 2.63e-15 ***
## noattrit$GenderMale                     -2.246e-01  1.298e-01  -1.731  0.08378 .  
## noattrit$JobInvolvement                  2.057e-01  9.152e-02   2.247  0.02481 *  
## noattrit$NumCompaniesWorked             -6.591e-02  2.820e-02  -2.337  0.01958 *  
## noattrit$YearsAtCompany                  3.656e-02  1.897e-02   1.927  0.05418 .  
## noattrit$YearsWithCurrManager           -7.996e-02  2.815e-02  -2.840  0.00459 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.191 on 1184 degrees of freedom
##   (35 observations deleted due to missingness)
## Multiple R-squared:  0.1094, Adjusted R-squared:  0.09957 
## F-statistic: 11.18 on 13 and 1184 DF,  p-value: < 2.2e-16

The fit is not great, but perhaps sufficient to identify the important first order terms to explore Job Roles

Linear Regression Modeling for Job Satisfaction first order factors
# Regress job satisfaction on every candidate predictor to see which
# first-order terms come out as significant.
js.lm <- lm(
  formula = noattrit$JobSatisfaction ~
    noattrit$BusinessTravel +
    noattrit$EducationField +
    noattrit$Gender +
    noattrit$OverTime +
    noattrit$MaritalStatus +
    noattrit$Age +
    noattrit$DailyRate +
    noattrit$DistanceFromHome +
    noattrit$Education +
    noattrit$EmployeeNumber +
    noattrit$EnvironmentSatisfaction +
    noattrit$HourlyRate +
    noattrit$JobInvolvement +
    noattrit$JobLevel +
    noattrit$JobRole +
    noattrit$MonthlyIncome +
    noattrit$MonthlyRate +
    noattrit$NumCompaniesWorked +
    noattrit$PercentSalaryHike +
    noattrit$PerformanceRating +
    noattrit$RelationshipSatisfaction +
    noattrit$StockOptionLevel +
    noattrit$TotalWorkingYears +
    noattrit$TrainingTimesLastYear +
    noattrit$WorkLifeBalance +
    noattrit$YearsAtCompany +
    noattrit$YearsInCurrentRole +
    noattrit$YearsSinceLastPromotion +
    noattrit$YearsWithCurrManager
)
print(summary(js.lm))
## 
## Call:
## lm(formula = noattrit$JobSatisfaction ~ noattrit$BusinessTravel + 
##     noattrit$EducationField + noattrit$Gender + noattrit$OverTime + 
##     noattrit$MaritalStatus + noattrit$Age + noattrit$DailyRate + 
##     noattrit$DistanceFromHome + noattrit$Education + noattrit$EmployeeNumber + 
##     noattrit$EnvironmentSatisfaction + noattrit$HourlyRate + 
##     noattrit$JobInvolvement + noattrit$JobLevel + noattrit$JobRole + 
##     noattrit$MonthlyIncome + noattrit$MonthlyRate + noattrit$NumCompaniesWorked + 
##     noattrit$PercentSalaryHike + noattrit$PerformanceRating + 
##     noattrit$RelationshipSatisfaction + noattrit$StockOptionLevel + 
##     noattrit$TotalWorkingYears + noattrit$TrainingTimesLastYear + 
##     noattrit$WorkLifeBalance + noattrit$YearsAtCompany + noattrit$YearsInCurrentRole + 
##     noattrit$YearsSinceLastPromotion + noattrit$YearsWithCurrManager)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3281 -0.8023  0.1658  1.0144  1.7461 
## 
## Coefficients:
##                                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                                     3.388e+00  5.759e-01   5.884  5.2e-09 ***
## noattrit$BusinessTravelTravel_Frequently        3.423e-05  1.213e-01   0.000   0.9998    
## noattrit$BusinessTravelTravel_Rarely           -1.124e-01  1.015e-01  -1.107   0.2686    
## noattrit$EducationFieldLife Sciences           -9.068e-02  3.226e-01  -0.281   0.7787    
## noattrit$EducationFieldMarketing               -3.070e-01  3.443e-01  -0.892   0.3727    
## noattrit$EducationFieldMedical                 -2.194e-01  3.244e-01  -0.676   0.4991    
## noattrit$EducationFieldOther                   -2.565e-01  3.439e-01  -0.746   0.4559    
## noattrit$EducationFieldTechnical Degree        -1.939e-01  3.393e-01  -0.571   0.5678    
## noattrit$GenderMale                             8.196e-02  6.457e-02   1.269   0.2046    
## noattrit$OverTimeYes                            1.514e-01  7.572e-02   2.000   0.0457 *  
## noattrit$MaritalStatusMarried                   6.284e-02  8.295e-02   0.758   0.4488    
## noattrit$MaritalStatusSingle                    2.900e-01  1.170e-01   2.479   0.0133 *  
## noattrit$Age                                    5.861e-04  5.039e-03   0.116   0.9074    
## noattrit$DailyRate                              7.588e-05  7.867e-05   0.965   0.3350    
## noattrit$DistanceFromHome                       2.969e-03  3.932e-03   0.755   0.4503    
## noattrit$Education                              6.693e-03  3.164e-02   0.211   0.8325    
## noattrit$EmployeeNumber                        -6.788e-05  5.215e-05  -1.302   0.1933    
## noattrit$EnvironmentSatisfaction               -3.693e-02  2.976e-02  -1.241   0.2150    
## noattrit$HourlyRate                            -3.215e-03  1.548e-03  -2.076   0.0381 *  
## noattrit$JobInvolvement                        -5.204e-02  4.582e-02  -1.136   0.2563    
## noattrit$JobLevel                               2.744e-02  1.039e-01   0.264   0.7918    
## noattrit$JobRoleHuman Resources                -1.049e-01  2.361e-01  -0.444   0.6569    
## noattrit$JobRoleHuman Resources Manager        -7.513e-02  4.247e-01  -0.177   0.8596    
## noattrit$JobRoleLaboratory Technician           1.770e-02  1.447e-01   0.122   0.9027    
## noattrit$JobRoleManufacturing Director         -9.222e-02  1.380e-01  -0.668   0.5041    
## noattrit$JobRoleResearch & Development Manager -4.119e-02  2.440e-01  -0.169   0.8660    
## noattrit$JobRoleResearch Director              -7.286e-02  2.153e-01  -0.338   0.7351    
## noattrit$JobRoleResearch Scientist              5.290e-02  1.407e-01   0.376   0.7070    
## noattrit$JobRoleSales Executive                 7.784e-02  1.301e-01   0.598   0.5497    
## noattrit$JobRoleSales Manager                   4.968e-02  2.636e-01   0.188   0.8506    
## noattrit$JobRoleSales Representative            8.984e-02  2.031e-01   0.442   0.6583    
## noattrit$MonthlyIncome                          4.124e-06  2.776e-05   0.149   0.8819    
## noattrit$MonthlyRate                           -5.035e-06  4.429e-06  -1.137   0.2558    
## noattrit$NumCompaniesWorked                    -2.026e-02  1.441e-02  -1.406   0.1600    
## noattrit$PercentSalaryHike                      1.693e-02  1.361e-02   1.244   0.2137    
## noattrit$PerformanceRating                     -3.150e-02  1.368e-01  -0.230   0.8180    
## noattrit$RelationshipSatisfaction              -4.988e-02  2.961e-02  -1.685   0.0923 .  
## noattrit$StockOptionLevel                       5.927e-02  5.093e-02   1.164   0.2447    
## noattrit$TotalWorkingYears                     -3.949e-03  8.865e-03  -0.446   0.6560    
## noattrit$TrainingTimesLastYear                 -5.648e-03  2.459e-02  -0.230   0.8184    
## noattrit$WorkLifeBalance                       -2.412e-02  4.651e-02  -0.519   0.6041    
## noattrit$YearsAtCompany                        -8.128e-05  1.095e-02  -0.007   0.9941    
## noattrit$YearsInCurrentRole                     2.683e-03  1.387e-02   0.193   0.8466    
## noattrit$YearsSinceLastPromotion               -7.644e-03  1.242e-02  -0.615   0.5384    
## noattrit$YearsWithCurrManager                  -1.587e-02  1.442e-02  -1.101   0.2712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.091 on 1188 degrees of freedom
## Multiple R-squared:  0.0395, Adjusted R-squared:  0.003926 
## F-statistic:  1.11 on 44 and 1188 DF,  p-value: 0.2889
# Refit the job-satisfaction model with a reduced set of four predictors.
js.lm <- lm(
  formula = noattrit$JobSatisfaction ~
    noattrit$OverTime +
    noattrit$MaritalStatus +
    noattrit$HourlyRate +
    noattrit$RelationshipSatisfaction
)

print(summary(js.lm))
## 
## Call:
## lm(formula = noattrit$JobSatisfaction ~ noattrit$OverTime + noattrit$MaritalStatus + 
##     noattrit$HourlyRate + noattrit$RelationshipSatisfaction)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1355 -0.8005  0.1982  1.1236  1.4946 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        3.018010   0.143969  20.963   <2e-16 ***
## noattrit$OverTimeYes               0.132104   0.073547   1.796   0.0727 .  
## noattrit$MaritalStatusMarried      0.037520   0.077789   0.482   0.6297    
## noattrit$MaritalStatusSingle       0.191990   0.086500   2.220   0.0266 *  
## noattrit$HourlyRate               -0.003140   0.001524  -2.060   0.0396 *  
## noattrit$RelationshipSatisfaction -0.049647   0.029088  -1.707   0.0881 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.089 on 1227 degrees of freedom
## Multiple R-squared:  0.01252,    Adjusted R-squared:  0.008492 
## F-statistic:  3.11 on 5 and 1227 DF,  p-value: 0.00852
#####Note:  The fit is poor, but may provide directionality as to first order factors to assess.
Marital Status
# 100%-stacked horizontal bars of marital status within each job-satisfaction
# level, with a dashed reference line at 0.5. The counts are taken from
# `noattrit` — the data set the preceding job-satisfaction regression and the
# other plots in this part of the analysis are built on — instead of `attrit`.
ggplot(
  data = as.data.frame(table(noattrit$JobSatisfaction, noattrit$MaritalStatus, dnn = list("JobSatisfaction", "MaritalStatus"))),
  aes(x = reorder(JobSatisfaction, Freq), y = Freq, fill = MaritalStatus)
) +
  geom_bar(stat = "identity", position = "fill") +
  coord_flip() +
  ggtitle("Job Satisfaction by Marital Status is Consistent") +
  xlab("") +
  ylab("") +
  geom_abline(slope = 0, intercept = 0.5, col = "black", lty = 2)

Relationship Satisfaction
# 100%-stacked horizontal bars of relationship-satisfaction level within each
# job-satisfaction level, with a dashed reference line at 0.5. The counts are
# taken from `noattrit` — the data set the preceding job-satisfaction
# regression and the other plots in this part of the analysis are built on —
# instead of `attrit`.
ggplot(
  data = as.data.frame(table(noattrit$JobSatisfaction, noattrit$RelationshipSatisfaction, dnn = list("JobSatisfaction", "RelationshipSatisfaction"))),
  aes(x = reorder(JobSatisfaction, Freq), y = Freq, fill = factor(RelationshipSatisfaction))
) +
  geom_bar(stat = "identity", position = "fill") +
  coord_flip() +
  ggtitle("Job Satisfaction by Relationship Satisfaction is Consistent") +
  xlab("") +
  ylab("") +
  geom_abline(slope = 0, intercept = 0.5, col = "black", lty = 2)

Find common demographics for people with high/low job satisfaction
# Collect the employees reporting job satisfaction 3 or 4 (level-3 rows first).
JS.4 <- noattrit[noattrit$JobSatisfaction == 4, ]
JS.3 <- noattrit[noattrit$JobSatisfaction == 3, ]
highjs <- rbind(JS.3, JS.4)

# Keep the four demographic columns by name. This yields the same columns, in
# the same order, as taking the first eight columns and then dropping
# Attrition, BusinessTravel, OverTime and MaritalStatus, but does not depend on
# the column positions in `noattrit`.
demohighjs <- highjs[, c("Department", "EducationField", "Gender", "JobRole"), drop = FALSE]

# Count how often each combination occurs (plyr::count adds a `freq` column)
# and show the ten most frequent combinations.
countdemohighjs <- plyr::count(demohighjs)
head(countdemohighjs[order(countdemohighjs$freq, decreasing = TRUE), ], 10)
##                Department EducationField Gender                   JobRole freq
## 21 Research & Development  Life Sciences   Male        Research Scientist   53
## 60                  Sales  Life Sciences   Male           Sales Executive   39
## 17 Research & Development  Life Sciences   Male     Laboratory Technician   38
## 33 Research & Development        Medical   Male        Research Scientist   35
## 16 Research & Development  Life Sciences   Male Healthcare Representative   31
## 66                  Sales      Marketing   Male           Sales Executive   30
## 29 Research & Development        Medical   Male     Laboratory Technician   29
## 15 Research & Development  Life Sciences Female        Research Scientist   28
## 57                  Sales  Life Sciences Female           Sales Executive   25
## 63                  Sales      Marketing Female           Sales Executive   24